Spaces:
Sleeping
Sleeping
| import asyncio | |
| import json | |
| import pandas as pd | |
| from typing import List, Dict, Any, Optional, Tuple | |
| from ocr import PDFProcessor, TextExtractor | |
| from extractor import TableExtractor, AccountExtractor, BalanceExtractor | |
| from utils import GroqClient | |
| from models.account_models import BankStatementData | |
| from config.config import settings | |
class BankStatementService:
    """Main service for processing bank statements.

    Pipeline run by :meth:`process_bank_statement`:

    1. save the uploaded file and detect whether the PDF is scanned,
    2. extract text lines with bounding boxes,
    3. build the transaction tables and merge the ones sharing a tag,
    4. send the first page's text to the LLM and build the account summary.

    The service can be used as an async context manager; entering returns
    the service itself and exiting performs no cleanup.
    """

    def __init__(self):
        self.pdf_processor = PDFProcessor()
        self.table_extractor = TableExtractor()
        # The two extractors below are not used by the methods of this class;
        # they are kept because external code may read these attributes.
        self.account_extractor = AccountExtractor()
        self.balance_extractor = BalanceExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Nothing to release; returning None lets any exception propagate.
        pass

    async def process_bank_statement(self, uploaded_file) -> "BankStatementData":
        """Main method to process uploaded bank statement.

        Args:
            uploaded_file: The upload object, handed unchanged to
                ``PDFProcessor.save_uploaded_file``.

        Returns:
            A ``BankStatementData`` holding the account summary dict and a
            mapping of table tag -> concatenated transaction table.

        Raises:
            IndexError: If no page was extracted, if fewer tables than tags
                were produced, or if the LLM response lists no accounts.
            KeyError: If the LLM response lacks an expected field.
            json.JSONDecodeError: If the LLM response is not valid JSON.
        """
        pdf_path = await self.pdf_processor.save_uploaded_file(uploaded_file)
        pdf_scanned = await self.pdf_processor.is_pdf_scanned(pdf_path)

        # Both branches use the same extractor; only the method differs.
        text_extractor = TextExtractor(self.pdf_processor.doctr_model)
        if pdf_scanned:
            print(f"{pdf_path} is likely a scanned PDF.")
            extracted_text_list = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
        else:
            print(f"{pdf_path} is not a scanned PDF. Extracting text...")
            extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)

        pre_processed_tables, table_tags = (
            await self.table_extractor.process_transaction_tables_with_bbox(extracted_text_list)
        )
        # Awaited one at a time, in order, so that processed_tables[i]
        # still corresponds to table_tags[i].
        processed_tables = [
            await self.table_extractor.process_tables(table)
            for table in pre_processed_tables
        ]
        final_table_dic = self._group_tables_by_tag(table_tags, processed_tables)

        if pdf_scanned:
            first_page = extracted_text_list
        else:
            # NOTE(review): a non-scanned PDF still goes through the
            # scanned-PDF extraction for its first page -- presumably
            # deliberate; confirm.
            first_page = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
                pdf_path, first_page=True
            )
        starting_text = self._first_page_text(first_page[0])

        async with GroqClient() as groq_client:
            raw_summary = await groq_client.extract_account_details(starting_text)
        bank_summary = json.loads(raw_summary)

        return BankStatementData(
            account_summary=self._build_account_summary(bank_summary),
            transaction_tables=final_table_dic,
        )

    @staticmethod
    def _group_tables_by_tag(table_tags: List[Any], tables: List[Any]) -> Dict[Any, Any]:
        """Concatenate the tables sharing a tag into one table per tag.

        ``table_tags[i]`` labels ``tables[i]``. Tags keep their first-seen
        order and tables keep their original order within a tag.

        Raises:
            IndexError: If there are fewer tables than tags.
        """
        if len(tables) < len(table_tags):
            raise IndexError(
                f"got {len(tables)} tables for {len(table_tags)} table tags"
            )
        grouped: Dict[Any, List[Any]] = {}
        for tag, table in zip(table_tags, tables):
            grouped.setdefault(tag, []).append(table)
        # Build a fresh dict instead of reassigning values while iterating.
        return {
            tag: pd.concat(group, ignore_index=True)
            for tag, group in grouped.items()
        }

    @staticmethod
    def _first_page_text(first_page_lines: List[Dict[str, Any]]) -> str:
        """Return the ``"line"`` text of every entry, one entry per line.

        Entries are joined with newlines so the last word of one line is
        not fused with the first word of the next in the LLM prompt.
        """
        return "\n".join(entry["line"] for entry in first_page_lines)

    @staticmethod
    def _build_account_summary(bank_summary: Dict[str, Any]) -> Dict[str, Any]:
        """Build the account summary dict from the parsed LLM response.

        The last entry of ``bank_summary["accounts"]`` is the one reported.
        A ``null`` bank name yields an empty string instead of failing.

        Raises:
            IndexError: If ``accounts`` is empty.
            KeyError: If an expected field is missing.
        """
        accounts = bank_summary["accounts"]
        if not accounts:
            raise IndexError("LLM response contains no accounts")
        account = accounts[-1]
        return {
            "Bank Name": str(bank_summary["bank_name"] or "").upper(),
            "Account Number": account["account_number"],
            "Starting Balance": str(account["starting_balance"]),
            "Ending Balance": str(account["ending_balance"]),
            "Statement Start Date": account["statement_start_date"],
            "Statement End Date": account["statement_end_date"],
        }