# Finapp / pdf_processor.py
# smainye's picture
# Update pdf_processor.py
# eb3ccfb verified
import pdfplumber
import pandas as pd
import re
from typing import List, Dict, Optional
import logging
from datetime import datetime
logger = logging.getLogger(__name__)
class PDFTransactionProcessor:
    """Process transaction data from PDF bank statements.

    The processor extracts the text of every page, guesses the issuing bank
    from keywords in that text, and parses each line that looks like
    ``<date> <description> <amount>`` into a transaction record.
    """

    # Whole-word keyword patterns used to recognise a bank, applied to the
    # lower-cased statement text. Word boundaries matter: a plain substring
    # test would find "chase" inside "purchase" and "citi" inside "cities".
    # Insertion order is the match priority (first hit wins).
    _BANK_INDICATORS = {
        'chase': re.compile(r'\b(?:chase|jpmorgan)\b'),
        'bank_of_america': re.compile(r'\b(?:bank of america|bofa)\b'),
        'wells_fargo': re.compile(r'\bwells fargo\b'),
        'citi': re.compile(r'\b(?:citibank|citi)\b'),
    }

    def __init__(self):
        """Compile the regular expressions used to recognise transactions.

        ``self.transaction_patterns`` holds a generic ``'standard'`` pattern
        and a ``'bank_specific'`` mapping of bank key -> pattern. Every
        pattern exposes the named groups ``date``, ``description`` and
        ``amount``.
        """
        # Digits of an amount: either comma-grouped thousands ("1,234.56")
        # or a plain run of digits ("1234.56"), always with two decimals.
        number = r'(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}'
        self.transaction_patterns = {
            'standard': re.compile(
                r'(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+'
                r'(?P<amount>-?\$?' + number + r')'
            ),
            'bank_specific': {
                # NOTE(review): Chase dates carry no year; how they are
                # resolved later depends on pd.to_datetime -- confirm.
                'chase': re.compile(
                    r'(?P<date>\d{2}/\d{2})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?\$?' + number + r')'
                ),
                # Unlike the other patterns, no "$" is accepted here.
                'bank_of_america': re.compile(
                    r'(?P<date>\d{2}/\d{2}/\d{4})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?' + number + r')'
                ),
            },
        }

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Return the text of all pages of the PDF, joined by newlines.

        Pages for which no text is extracted are skipped. Any exception
        raised while opening or reading the file is logged and re-raised.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            return "\n".join(text for text in page_texts if text)
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            raise

    def identify_bank(self, text: str) -> Optional[str]:
        """Return the key of the first bank whose keyword occurs in *text*.

        Matching is case-insensitive and on whole words only, so ordinary
        words such as "purchase" or "cities" do not trigger a match.
        Returns ``None`` when no known bank keyword is present.
        """
        text_lower = text.lower()
        for bank, indicator_pattern in self._BANK_INDICATORS.items():
            if indicator_pattern.search(text_lower):
                return bank
        return None

    def parse_transactions(self, text: str, bank: Optional[str] = None) -> List[Dict]:
        """Parse transactions from extracted statement text.

        When *bank* has a dedicated pattern it is tried first; if it yields
        nothing (or there is no dedicated pattern) the standard pattern is
        used instead. Returns a list of dicts with the keys ``date`` (str),
        ``description`` (str) and ``amount`` (float); the list may be empty.
        """
        lines = text.split('\n')
        transactions: List[Dict] = []

        bank_patterns = self.transaction_patterns['bank_specific']
        if bank and bank in bank_patterns:
            transactions = self._parse_with_pattern(lines, bank_patterns[bank])

        # Fall back to the standard parser if no matches were found.
        if not transactions:
            transactions = self._parse_with_pattern(
                lines, self.transaction_patterns['standard']
            )
        return transactions

    def _parse_with_pattern(self, lines: List[str], pattern) -> List[Dict]:
        """Return one transaction dict per line that *pattern* matches.

        The pattern is searched anywhere in the line; "$" and "," are
        removed from the amount before converting it to ``float``. Lines
        whose groups cannot be converted are skipped with a debug message.
        """
        transactions = []
        for line in lines:
            match = pattern.search(line)
            if not match:
                continue
            try:
                amount = match.group('amount').replace('$', '').replace(',', '')
                transactions.append({
                    'date': match.group('date'),
                    'description': match.group('description').strip(),
                    'amount': float(amount),
                })
            except (ValueError, AttributeError):
                logger.debug("Skipping line due to parsing error: %s", line)
        return transactions

    def process_pdf(self, pdf_path: str) -> pd.DataFrame:
        """Process a PDF statement and return its transactions as a DataFrame.

        The frame has the columns ``date`` (converted with
        ``pd.to_datetime``), ``description`` and ``amount``. Rows whose date
        cannot be converted, or whose amount is missing, are dropped.

        Raises:
            ValueError: if no line of the PDF text parses as a transaction.
            Exception: anything raised while reading the PDF is logged and
                re-raised unchanged.
        """
        try:
            text = self.extract_text_from_pdf(pdf_path)
            bank = self.identify_bank(text)
            transactions = self.parse_transactions(text, bank)
            if not transactions:
                raise ValueError("No transactions found in PDF")

            df = pd.DataFrame(transactions)
            # Clean and standardize data; unparseable dates become NaT and
            # are removed by the dropna below.
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df = df.dropna(subset=['date', 'amount'])
            df['description'] = df['description'].str.strip()
            return df
        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise