import logging
import re
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd

try:
    import pdfplumber
except ImportError:
    # Only extract_text_from_pdf() needs the PDF backend. Deferring the
    # failure keeps the text-parsing helpers usable without it.
    pdfplumber = None

logger = logging.getLogger(__name__)

# Lower-case keywords that identify each bank in statement text.
_BANK_INDICATORS = {
    'chase': ('chase', 'jpmorgan'),
    'bank_of_america': ('bank of america', 'bofa'),
    'wells_fargo': ('wells fargo',),
    'citi': ('citibank', 'citi'),
}

# One compiled pattern per bank. The leading \b stops a keyword from matching
# in the middle of an unrelated word (e.g. "chase" inside "purchase") while
# still matching compounds that start with it ("jpmorganchase", "citigroup").
_BANK_PATTERNS = {
    bank: re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')')
    for bank, words in _BANK_INDICATORS.items()
}


class PDFTransactionProcessor:
    """Process transaction data from PDF bank statements."""

    def __init__(self):
        """Compile the regexes used to recognise transaction lines.

        Every pattern exposes the named groups ``date``, ``description`` and
        ``amount``, which ``_parse_with_pattern`` reads by name.
        """
        self.transaction_patterns = {
            # MM/DD/YY or MM/DD/YYYY, free-text description, optional "$".
            'standard': re.compile(
                r'(?P<date>\d{2}/\d{2}/\d{2,4})\s+'
                r'(?P<description>.+?)\s+'
                r'(?P<amount>-?\$?\d{1,3}(?:,\d{3})*\.\d{2})'
            ),
            'bank_specific': {
                # Dates without a year (MM/DD).
                'chase': re.compile(
                    r'(?P<date>\d{2}/\d{2})\s+'
                    r'(?P<description>.+?)\s+'
                    r'(?P<amount>-?\$?\d{1,3}(?:,\d{3})*\.\d{2})'
                ),
                # Four-digit year, amounts without a currency symbol.
                'bank_of_america': re.compile(
                    r'(?P<date>\d{2}/\d{2}/\d{4})\s+'
                    r'(?P<description>.+?)\s+'
                    r'(?P<amount>-?\d{1,3}(?:,\d{3})*\.\d{2})'
                ),
            },
        }

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            The text of every page that yielded text, joined with newlines.

        Raises:
            ImportError: If ``pdfplumber`` is not installed.
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised unchanged.
        """
        if pdfplumber is None:
            raise ImportError(
                "pdfplumber is required to extract text from PDF files"
            )
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages without extractable text yield None/"" and are skipped.
            return "\n".join(text for text in page_texts if text)
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            raise

    def identify_bank(self, text: str) -> Optional[str]:
        """Identify bank from PDF text.

        Matching is case-insensitive and anchored at the start of a word.
        Banks are tried in the order chase, bank_of_america, wells_fargo,
        citi; the first hit wins.

        Args:
            text: Text extracted from a statement.

        Returns:
            The bank key, or ``None`` when no indicator is found.
        """
        text_lower = text.lower()
        for bank, pattern in _BANK_PATTERNS.items():
            if pattern.search(text_lower):
                return bank
        return None

    def parse_transactions(self, text: str, bank: Optional[str] = None) -> List[Dict]:
        """Parse transactions from extracted text.

        Args:
            text: Statement text, one candidate transaction per line.
            bank: Optional bank key; when a bank-specific pattern exists for
                it, that pattern is tried first.

        Returns:
            A list of dicts with keys ``date`` (str), ``description`` (str)
            and ``amount`` (float). Empty when no line matches.
        """
        transactions: List[Dict] = []
        lines = text.split('\n')

        # Try bank-specific parser first
        bank_patterns = self.transaction_patterns['bank_specific']
        if bank and bank in bank_patterns:
            transactions = self._parse_with_pattern(lines, bank_patterns[bank])

        # Fall back to standard parser if no matches found
        if not transactions:
            transactions = self._parse_with_pattern(
                lines, self.transaction_patterns['standard']
            )

        return transactions

    def _parse_with_pattern(self, lines: List[str], pattern: "re.Pattern[str]") -> List[Dict]:
        """Helper method to parse transactions using a regex pattern.

        Args:
            lines: Individual lines of statement text.
            pattern: Compiled regex exposing the named groups ``date``,
                ``description`` and ``amount``.

        Returns:
            One dict per matching line, in input order. Lines that do not
            match, or whose amount cannot be converted, are skipped.
        """
        transactions = []
        for line in lines:
            match = pattern.search(line)
            if not match:
                continue
            try:
                # Drop the currency symbol and thousands separators so the
                # text is accepted by float().
                amount = match.group('amount').replace('$', '').replace(',', '')
                transactions.append({
                    'date': match.group('date'),
                    'description': match.group('description').strip(),
                    'amount': float(amount),
                })
            except (ValueError, AttributeError):
                logger.debug("Skipping line due to parsing error: %s", line)
        return transactions

    def process_pdf(self, pdf_path: str) -> pd.DataFrame:
        """Process PDF file and return transactions as DataFrame.

        Args:
            pdf_path: Path of the statement PDF.

        Returns:
            A DataFrame with columns ``date`` (datetime), ``description`` and
            ``amount``. Rows whose date could not be parsed are dropped.

        Raises:
            ValueError: If no transaction line is found in the PDF.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            text = self.extract_text_from_pdf(pdf_path)
            bank = self.identify_bank(text)
            transactions = self.parse_transactions(text, bank)

            if not transactions:
                raise ValueError("No transactions found in PDF")

            df = pd.DataFrame(transactions)

            # Clean and standardize data
            df['description'] = df['description'].str.strip()
            # Unparseable dates become NaT and are removed below.
            # NOTE(review): year-less dates (MM/DD) are left to pandas to
            # interpret - confirm which year it assigns.
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df = df.dropna(subset=['date', 'amount'])

            return df
        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise