File size: 4,363 Bytes
01eb70f
 
eb3ccfb
 
01eb70f
eb3ccfb
01eb70f
 
 
 
eb3ccfb
 
01eb70f
eb3ccfb
 
 
 
 
 
 
 
 
 
 
 
 
ebbc133
eb3ccfb
 
ebbc133
eb3ccfb
 
 
 
 
 
 
ebbc133
eb3ccfb
 
ebbc133
eb3ccfb
 
 
 
 
 
 
01eb70f
 
eb3ccfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01eb70f
eb3ccfb
 
 
 
01eb70f
eb3ccfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9169304
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pdfplumber
import pandas as pd
import re
from typing import List, Dict, Optional
import logging
from datetime import datetime

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class PDFTransactionProcessor:
    """Process transaction data from PDF bank statements.

    The processor extracts the text of a PDF, guesses the issuing bank from
    keywords in that text, and parses each line against a regular expression
    with ``date``, ``description`` and ``amount`` named groups.

    Attributes:
        transaction_patterns: Mapping with a ``'standard'`` compiled pattern
            and a ``'bank_specific'`` sub-mapping of bank key -> compiled
            pattern.
    """

    def __init__(self):
        """Compile the transaction line patterns."""
        # Numeric part of an amount: either comma-grouped thousands
        # ("1,234.56") or a plain run of digits ("1234.56" / "12.34"),
        # always followed by exactly two decimals. Accepting the plain run
        # matters: without it, amounts >= 1000 written without separators
        # would never match and the transaction would be lost.
        number = r'(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}'

        self.transaction_patterns = {
            'standard': re.compile(
                r'(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+'
                r'(?P<amount>-?\$?' + number + r')'
            ),
            'bank_specific': {
                # Chase-style lines carry a year-less MM/DD date.
                'chase': re.compile(
                    r'(?P<date>\d{2}/\d{2})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?\$?' + number + r')'
                ),
                # Bank of America pattern: four-digit year, no "$" allowed.
                'bank_of_america': re.compile(
                    r'(?P<date>\d{2}/\d{2}/\d{4})\s+(?P<description>.+?)\s+'
                    r'(?P<amount>-?' + number + r')'
                ),
            },
        }

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file.

        Args:
            pdf_path: Path of the PDF to open with ``pdfplumber``.

        Returns:
            The text of all pages joined with newlines. Pages for which
            ``extract_text()`` returns nothing are skipped.

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised unchanged.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            return "\n".join(text for text in page_texts if text)
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            raise

    def identify_bank(self, text: str) -> Optional[str]:
        """Identify bank from PDF text.

        Indicators are matched case-insensitively as whole words, so that
        ordinary words which merely contain an indicator (e.g. "purchase"
        contains "chase", "citizens" contains "citi") do not trigger a
        false identification.

        Args:
            text: Full text extracted from the statement.

        Returns:
            The key of the first bank (in declaration order) with a matching
            indicator, or ``None`` when nothing matches.
        """
        bank_indicators = {
            # 'jpmorganchase' covers the run-together form (e.g. in domain
            # names), which whole-word matching of 'chase'/'jpmorgan' alone
            # would miss.
            'chase': ['chase', 'jpmorgan', 'jpmorganchase'],
            'bank_of_america': ['bank of america', 'bofa'],
            'wells_fargo': ['wells fargo'],
            'citi': ['citibank', 'citi'],
        }

        text_lower = text.lower()
        for bank, indicators in bank_indicators.items():
            for indicator in indicators:
                if re.search(r'\b' + re.escape(indicator) + r'\b', text_lower):
                    return bank
        return None

    def parse_transactions(self, text: str, bank: Optional[str] = None) -> List[Dict]:
        """Parse transactions from extracted text.

        Args:
            text: Statement text, one candidate transaction per line.
            bank: Optional bank key; when it has an entry under
                ``transaction_patterns['bank_specific']`` that pattern is
                tried first.

        Returns:
            A list of ``{'date': str, 'description': str, 'amount': float}``
            dicts. If the bank-specific pattern yields nothing (or no bank
            pattern applies), the ``'standard'`` pattern is used instead.
        """
        transactions: List[Dict] = []
        # splitlines() also handles "\r\n" so no stray "\r" is left on lines.
        lines = text.splitlines()

        # Try bank-specific parser first
        bank_patterns = self.transaction_patterns['bank_specific']
        if bank and bank in bank_patterns:
            transactions = self._parse_with_pattern(lines, bank_patterns[bank])

        # Fall back to standard parser if no matches found
        if not transactions:
            transactions = self._parse_with_pattern(
                lines, self.transaction_patterns['standard']
            )

        return transactions

    def _parse_with_pattern(self, lines: List[str], pattern) -> List[Dict]:
        """Helper method to parse transactions using a regex pattern.

        Args:
            lines: Individual lines of statement text.
            pattern: Compiled regex exposing ``date``, ``description`` and
                ``amount`` named groups.

        Returns:
            One dict per line in which ``pattern`` is found; lines without a
            match, or whose amount cannot be converted to ``float``, are
            skipped.
        """
        transactions = []
        for line in lines:
            match = pattern.search(line)
            if not match:
                continue
            # Drop currency symbol and thousands separators before float().
            amount_text = match.group('amount').replace('$', '').replace(',', '')
            try:
                amount = float(amount_text)
            except ValueError:
                logger.debug("Skipping line due to parsing error: %s", line)
                continue
            transactions.append({
                'date': match.group('date'),
                'description': match.group('description').strip(),
                'amount': amount,
            })
        return transactions

    def process_pdf(self, pdf_path: str) -> pd.DataFrame:
        """Process PDF file and return transactions as DataFrame.

        Args:
            pdf_path: Path of the PDF statement.

        Returns:
            A DataFrame with ``date`` (converted via ``pd.to_datetime``),
            ``description`` and ``amount`` columns. Rows whose date could not
            be converted, or whose amount is missing, are dropped.

        Raises:
            ValueError: If no transactions were found in the PDF.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            text = self.extract_text_from_pdf(pdf_path)
            bank = self.identify_bank(text)
            transactions = self.parse_transactions(text, bank)

            if not transactions:
                raise ValueError("No transactions found in PDF")

            df = pd.DataFrame(transactions)

            # Clean and standardize data
            # NOTE(review): Chase-style dates carry no year; how
            # pd.to_datetime resolves them should be confirmed.
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df = df.dropna(subset=['date', 'amount'])
            df['description'] = df['description'].str.strip()

            return df

        except Exception as e:
            logger.error("Error processing PDF: %s", e)
            raise