Bank-Scrubber-v2 / src /services /bank_statement_service.py
Aryan Jain
update imports
2a728d0
import asyncio
import json
import pandas as pd
from typing import List, Dict, Any, Optional, Tuple
from ocr import PDFProcessor, TextExtractor
from extractor import TableExtractor, AccountExtractor, BalanceExtractor
from utils import GroqClient
from models.account_models import BankStatementData
from config.config import settings
class BankStatementService:
    """Main service for processing bank statements.

    Runs the whole pipeline for one uploaded PDF: save the upload, extract
    its text lines, build the transaction tables and ask the LLM client for
    the account details.  The service can be used with ``async with``;
    entering returns the service and exiting does nothing.
    """

    def __init__(self):
        """Create the PDF processor and the extractors used by the pipeline."""
        self.pdf_processor = PDFProcessor()
        self.table_extractor = TableExtractor()
        self.account_extractor = AccountExtractor()
        self.balance_extractor = BalanceExtractor()

    async def __aenter__(self):
        """Return the service itself so it can be bound by ``async with``."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Do nothing on exit; returns None, so exceptions propagate."""

    async def process_bank_statement(self, uploaded_file) -> BankStatementData:
        """Main method to process uploaded bank statement.

        Args:
            uploaded_file: The uploaded PDF, handed unchanged to
                ``self.pdf_processor.save_uploaded_file``.

        Returns:
            A ``BankStatementData`` built from the account summary and a
            mapping of table tag to the concatenated tables for that tag.

        Raises:
            IndexError: If the first-page extraction result is empty, the
                LLM response lists no accounts, or there are more table tags
                than processed tables.
            KeyError: If the LLM response lacks one of the expected fields.
            json.JSONDecodeError: If the LLM response is not valid JSON.
        """
        # Save uploaded file
        pdf_path = await self.pdf_processor.save_uploaded_file(uploaded_file)

        # Check if PDF is scanned
        pdf_scanned = await self.pdf_processor.is_pdf_scanned(pdf_path)

        # Extract text based on PDF type
        text_extractor, extracted_text_list = await self._extract_text_lines(
            pdf_path, pdf_scanned
        )

        # Process, clean and group the transaction tables by tag
        transaction_tables = await self._build_transaction_tables(extracted_text_list)

        # Extract account information from first page.  A scanned PDF reuses
        # the lines already extracted; otherwise the first page is fetched
        # again through the scanned-PDF extractor with ``first_page=True``.
        if pdf_scanned:
            first_page = extracted_text_list
        else:
            first_page = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
                pdf_path, first_page=True
            )

        # Extract text for LLM processing.  Lines are joined without any
        # separator, exactly as they were concatenated before.
        starting_text = "".join(entry["line"] for entry in first_page[0])

        # Extract account details using LLM
        async with GroqClient() as groq_client:
            raw_summary = await groq_client.extract_account_details(starting_text)
        bank_summary = json.loads(raw_summary)

        return BankStatementData(
            account_summary=self._build_account_summary(bank_summary),
            transaction_tables=transaction_tables,
        )

    async def _extract_text_lines(self, pdf_path, pdf_scanned) -> Tuple[Any, Any]:
        """Extract text lines using the extraction method for the PDF type.

        Returns:
            The ``TextExtractor`` that was used (the caller may need it again
            for the first page) and the extracted lines.
        """
        text_extractor = TextExtractor(self.pdf_processor.doctr_model)
        if pdf_scanned:
            print(f"{pdf_path} is likely a scanned PDF.")
            extracted_text_list = (
                await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
            )
        else:
            print(f"{pdf_path} is not a scanned PDF. Extracting text...")
            extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)
        return text_extractor, extracted_text_list

    async def _build_transaction_tables(self, extracted_text_list) -> Dict[Any, Any]:
        """Detect, clean and group the transaction tables found in the text."""
        (
            pre_processed_tables,
            table_tags,
        ) = await self.table_extractor.process_transaction_tables_with_bbox(
            extracted_text_list
        )

        # Tables are cleaned one at a time, in their original order.
        processed_tables = [
            await self.table_extractor.process_tables(table)
            for table in pre_processed_tables
        ]
        return self._group_tables_by_tag(table_tags, processed_tables)

    @staticmethod
    def _group_tables_by_tag(table_tags, processed_tables) -> Dict[Any, Any]:
        """Concatenate the tables that share a tag, keeping first-seen tag order."""
        grouped: Dict[Any, List[Any]] = {}
        # Pair by index rather than zip() so a tag without a matching table
        # raises IndexError instead of being silently dropped.
        for index, tag in enumerate(table_tags):
            grouped.setdefault(tag, []).append(processed_tables[index])
        return {
            tag: pd.concat(tables, ignore_index=True)
            for tag, tables in grouped.items()
        }

    @staticmethod
    def _build_account_summary(bank_summary) -> Dict[str, Any]:
        """Build the account summary from the parsed LLM response.

        The details are read from the last entry of ``bank_summary["accounts"]``.
        """
        bank_name = bank_summary["bank_name"].upper()
        account = bank_summary["accounts"][-1]
        return {
            "Bank Name": bank_name,
            "Account Number": account["account_number"],
            "Starting Balance": str(account["starting_balance"]),
            "Ending Balance": str(account["ending_balance"]),
            "Statement Start Date": account["statement_start_date"],
            "Statement End Date": account["statement_end_date"],
        }