import logging
import re
from datetime import datetime
from typing import List, Dict, Optional

import pandas as pd
import pdfplumber

# Module-level logger named after this module, so applications can configure
# or filter its output through the standard logging hierarchy.
logger = logging.getLogger(__name__)


class PDFTransactionProcessor:
    """Process transaction data from PDF bank statements.

    The text of every page is extracted with pdfplumber, the issuing bank is
    guessed from keywords found in that text, and each text line is matched
    against a regular expression that yields a date, a description and an
    amount.
    """

    # Numeric part of an amount: either comma-grouped thousands ("1,234.56")
    # or a plain digit run ("1234.56"), always with exactly two decimals.
    # Accepting the plain form keeps ungrouped amounts >= 1000 from being
    # silently skipped.
    _AMOUNT_DIGITS = r'(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}'

    # Keywords (lower-case) that identify each bank. Checked in insertion
    # order; the first bank with a matching keyword wins.
    _BANK_INDICATORS = {
        'chase': ['chase', 'jpmorgan'],
        'bank_of_america': ['bank of america', 'bofa'],
        'wells_fargo': ['wells fargo'],
        'citi': ['citibank', 'citi'],
    }

    def __init__(self):
        """Compile the generic and the bank-specific transaction patterns."""
        digits = self._AMOUNT_DIGITS
        self.transaction_patterns = {
            # Fallback layout: MM/DD/YY or MM/DD/YYYY, free text, amount.
            'standard': re.compile(
                r'(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+'
                r'(?P<amount>-?\$?' + digits + r')'
            ),
            'bank_specific': {
                # Date is two digit pairs only (no year component).
                'chase': re.compile(
                    r'(?P<date>\d{2}/\d{2})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?\$?' + digits + r')'
                ),
                # Four-digit year; the amount carries no currency symbol.
                'bank_of_america': re.compile(
                    r'(?P<date>\d{2}/\d{2}/\d{4})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?' + digits + r')'
                ),
            },
        }

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file.

        Args:
            pdf_path: Path of the PDF to open with pdfplumber.

        Returns:
            The text of all pages joined with newlines. Pages for which
            ``extract_text()`` returns nothing are skipped.

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised unchanged.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            return "\n".join(text for text in page_texts if text)
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            raise

    def identify_bank(self, text: str) -> Optional[str]:
        """Identify bank from PDF text.

        Keywords are matched case-insensitively and only as whole words, so
        that e.g. "purchase" is not mistaken for "chase" and "citizens" is
        not mistaken for "citi".

        Args:
            text: Text extracted from the statement.

        Returns:
            The key of the first bank whose keyword occurs in ``text``
            (``'chase'``, ``'bank_of_america'``, ``'wells_fargo'`` or
            ``'citi'``), or ``None`` when nothing matches or ``text`` is
            empty.
        """
        if not text:
            return None

        text_lower = text.lower()
        for bank, indicators in self._BANK_INDICATORS.items():
            if any(
                re.search(r'\b' + re.escape(indicator) + r'\b', text_lower)
                for indicator in indicators
            ):
                return bank
        return None

    def parse_transactions(self, text: str, bank: Optional[str] = None) -> List[Dict]:
        """Parse transactions from extracted text.

        Args:
            text: Statement text, one candidate transaction per line.
            bank: Optional bank key. When it names an entry in
                ``transaction_patterns['bank_specific']`` that pattern is
                tried first.

        Returns:
            A list of dicts with keys ``'date'`` (str), ``'description'``
            (str) and ``'amount'`` (float). Empty when no line matches.
        """
        lines = text.splitlines()
        transactions: List[Dict] = []

        bank_patterns = self.transaction_patterns['bank_specific']
        if bank and bank in bank_patterns:
            transactions = self._parse_with_pattern(lines, bank_patterns[bank])

        # Fall back to the generic layout when there is no bank-specific
        # pattern or it matched nothing.
        if not transactions:
            transactions = self._parse_with_pattern(
                lines, self.transaction_patterns['standard']
            )

        return transactions

    def _parse_with_pattern(self, lines: List[str], pattern) -> List[Dict]:
        """Helper method to parse transactions using a regex pattern.

        Args:
            lines: Individual text lines to scan.
            pattern: Compiled regex exposing the named groups ``date``,
                ``description`` and ``amount``.

        Returns:
            One dict per line in which ``pattern`` is found; lines without a
            match, or whose amount cannot be converted, are skipped.
        """
        transactions = []
        for line in lines:
            match = pattern.search(line)
            if not match:
                continue
            try:
                # Drop the currency symbol and thousands separators so the
                # remaining text is a plain decimal literal.
                amount = match.group('amount').replace('$', '').replace(',', '')
                transactions.append({
                    'date': match.group('date'),
                    'description': match.group('description').strip(),
                    'amount': float(amount),
                })
            except (ValueError, AttributeError):
                logger.debug("Skipping line due to parsing error: %s", line)
        return transactions

    def process_pdf(self, pdf_path: str) -> pd.DataFrame:
        """Process PDF file and return transactions as DataFrame.

        Args:
            pdf_path: Path of the PDF statement.

        Returns:
            A DataFrame with columns ``date`` (converted with
            ``pd.to_datetime``), ``description`` and ``amount``. Rows whose
            date could not be converted, or whose amount is missing, are
            dropped.

        Raises:
            ValueError: If no line of the PDF text matched a transaction
                pattern.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            text = self.extract_text_from_pdf(pdf_path)
            bank = self.identify_bank(text)
            transactions = self.parse_transactions(text, bank)

            if not transactions:
                raise ValueError("No transactions found in PDF")

            df = pd.DataFrame(transactions)

            # Clean the columns on the freshly built frame, then filter, so
            # no column is assigned on the result of dropna().
            df['description'] = df['description'].str.strip()
            # NOTE(review): rows from the 'chase' pattern carry no year in
            # their date text; confirm how pd.to_datetime should resolve them.
            df['date'] = pd.to_datetime(df['date'], errors='coerce')

            return df.dropna(subset=['date', 'amount'])

        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise