# File size: 12,749 Bytes
#
# f60e9c2

"""
PDF Parser for Bank Statements
==============================

Extract transactions from Indian bank statement PDFs.

Supports:
- HDFC Bank statements
- ICICI Bank statements
- SBI Bank statements
- Axis Bank statements
- Other banks, through a generic date / description / amount fallback pattern

Author: Ranjit Behera
"""

import re
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
import io


@dataclass
class PDFTransaction:
    """One transaction row recovered from a statement.

    Attributes:
        date: Date text, kept as a string.
        description: Narration / description text of the row.
        amount: Transaction amount.
        type: Direction of the transaction, "debit" or "credit".
        balance: Balance after the transaction, or None when not available.
        reference: Reference number associated with the row, or None.
    """
    # Required fields first; their order is the positional constructor order.
    date: str
    description: str
    amount: float
    # Either "debit" or "credit".
    type: str
    # Optional extras, absent (None) unless explicitly supplied.
    balance: Optional[float] = None
    reference: Optional[str] = None


class BankStatementParser:
    """
    Parse bank statement PDFs and extract transactions.

    Text is pulled out of the PDF with pdfplumber (imported lazily, so the
    class can be constructed without it installed) and then scanned with
    per-bank regular expressions. Statements whose header matches none of
    the known banks are scanned with a generic date/description/amount
    pattern instead.
    """

    # Bank-specific patterns.
    # In every "transaction" pattern groups 1-3 are date, description and
    # amount. Any further groups are optional and hold either a Dr/Cr marker
    # or the running balance; which one it is gets decided from the captured
    # text, not from the group's position (the patterns differ in layout).
    BANK_PATTERNS = {
        "hdfc": {
            "header": r"HDFC\s+BANK",
            "date": r"(\d{2}/\d{2}/\d{2,4})",
            "transaction": r"(\d{2}/\d{2}/\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([DC]r)?\s*([\d,]+\.\d{2})?",
        },
        "icici": {
            "header": r"ICICI\s+BANK",
            "date": r"(\d{2}-\w{3}-\d{2,4})",
            "transaction": r"(\d{2}-\w{3}-\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*(Dr|Cr)?\s*([\d,]+\.\d{2})?",
        },
        "sbi": {
            "header": r"State\s+Bank\s+of\s+India",
            "date": r"(\d{2}\s+\w{3}\s+\d{2,4})",
            "transaction": r"(\d{2}\s+\w{3}\s+\d{4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([\d,]+\.\d{2})?",
        },
        "axis": {
            "header": r"AXIS\s+BANK",
            "date": r"(\d{2}-\d{2}-\d{2,4})",
            "transaction": r"(\d{2}-\d{2}-\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([\d,]+\.\d{2})?",
        },
    }

    # Fallback pattern for unknown banks: date, description, amount.
    _GENERIC_PATTERN = r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})"

    # Upper-cased debit/credit markers and the transaction type they denote.
    _TYPE_MARKERS = {
        "CR": "credit",
        "C": "credit",
        "CREDIT": "credit",
        "DR": "debit",
        "D": "debit",
        "DEBIT": "debit",
    }

    # Description keywords that indicate money coming in.
    _CREDIT_KEYWORDS = ("salary", "credited", "received", "refund", "cashback", "interest")

    # Reference-number patterns, tried in order; group 1 is the reference.
    _REFERENCE_PATTERNS = (
        r"[Rr]ef[.:# ]*(\d{10,18})",
        r"UTR[.:# ]*(\w{12,22})",
        r"IMPS[.:# ]*(\d{12})",
        r"NEFT[.:# ]*(\w{10,16})",
    )

    def __init__(self):
        self.pdfplumber = None
        self._check_dependencies()

    def _check_dependencies(self):
        """Import pdfplumber if installed; leave ``self.pdfplumber`` as None otherwise."""
        try:
            import pdfplumber
            self.pdfplumber = pdfplumber
        except ImportError:
            self.pdfplumber = None

    def _extract_text(self, source) -> str:
        """
        Return the text of every page of a PDF, separated by newlines.

        Args:
            source: Whatever is handed to ``pdfplumber.open``; the callers in
                this class pass a filesystem path or an ``io.BytesIO``.

        Returns:
            Concatenated page text (pages without text contribute "").

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if self.pdfplumber is None:
            raise ImportError("pdfplumber is required. Install with: pip install pdfplumber")

        with self.pdfplumber.open(source) as pdf:
            # A newline between pages keeps the last token of one page from
            # fusing with the first token of the next (e.g. a trailing page
            # number gluing itself onto the following page's leading date).
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    def parse_file(self, file_path: Path) -> "List[PDFTransaction]":
        """
        Parse a PDF file and extract transactions.

        Args:
            file_path: Path to PDF file

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_text(file_path))

    def parse_bytes(self, pdf_bytes: bytes) -> "List[PDFTransaction]":
        """
        Parse PDF from bytes.

        Args:
            pdf_bytes: PDF file content as bytes

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_text(io.BytesIO(pdf_bytes)))

    def parse_text(self, text: str) -> "List[PDFTransaction]":
        """
        Parse extracted text and identify transactions.

        Args:
            text: Extracted text from PDF

        Returns:
            List of transactions, parsed with the detected bank's pattern or
            with the generic pattern when no bank header is recognised.
        """
        bank = self._detect_bank(text)
        if bank:
            return self._parse_with_pattern(text, bank)
        return self._parse_generic(text)

    def _detect_bank(self, text: str) -> Optional[str]:
        """Return the BANK_PATTERNS key whose header occurs in *text*, else None."""
        for bank, patterns in self.BANK_PATTERNS.items():
            # IGNORECASE already makes the search case-insensitive, so the
            # text does not need to be upper-cased first.
            if re.search(patterns["header"], text, re.IGNORECASE):
                return bank
        return None

    def _fields_from_match(self, match: "re.Match[str]") -> Dict:
        """
        Turn one transaction regex match into PDFTransaction keyword arguments.

        Groups 1-3 are date, description and amount. Each remaining non-empty
        group is classified by content: a Dr/Cr marker sets the type, anything
        else is parsed as the balance. When the row carries no marker, the
        type is inferred from the description.

        Raises:
            ValueError: If an amount or balance is not a valid number.
            IndexError: If the pattern has fewer than three groups.
        """
        date = match.group(1)
        description = match.group(2).strip()
        amount = float(match.group(3).replace(",", ""))

        txn_type = None
        balance = None
        for extra in match.groups()[3:]:
            if not extra:
                continue  # optional group that did not participate
            marker = self._TYPE_MARKERS.get(extra.strip().upper())
            if marker is not None:
                txn_type = marker
            else:
                balance = float(extra.replace(",", ""))

        if txn_type is None:
            # No explicit Dr/Cr column on this row: fall back to the same
            # keyword heuristic the generic parser uses.
            txn_type = self._infer_type(description)

        return {
            "date": date,
            "description": description,
            "amount": amount,
            "type": txn_type,
            "balance": balance,
            "reference": self._extract_reference(description),
        }

    def _iter_transaction_fields(self, pattern: str, text: str):
        """Yield a field dict for every match of *pattern* in *text*, skipping malformed rows."""
        for match in re.finditer(pattern, text, re.MULTILINE):
            try:
                fields = self._fields_from_match(match)
            except (ValueError, IndexError):
                continue
            yield fields

    def _build_transactions(self, pattern: str, text: str) -> "List[PDFTransaction]":
        """Return a PDFTransaction for every well-formed match of *pattern* in *text*."""
        return [
            PDFTransaction(**fields)
            for fields in self._iter_transaction_fields(pattern, text)
        ]

    def _parse_with_pattern(self, text: str, bank: str) -> "List[PDFTransaction]":
        """Parse *text* using the transaction pattern registered for *bank*."""
        return self._build_transactions(self.BANK_PATTERNS[bank]["transaction"], text)

    def _parse_generic(self, text: str) -> "List[PDFTransaction]":
        """Generic parsing for unknown bank formats (type inferred, no balance)."""
        return self._build_transactions(self._GENERIC_PATTERN, text)

    def _extract_reference(self, description: str) -> Optional[str]:
        """Return the first reference number found in *description*, or None."""
        for pattern in self._REFERENCE_PATTERNS:
            match = re.search(pattern, description)
            if match:
                return match.group(1)
        return None

    def _infer_type(self, description: str) -> str:
        """
        Infer transaction type from description.

        Returns "credit" when a credit keyword occurs in the description and
        "debit" otherwise (debit is the default, so no debit keywords are
        needed).
        """
        lowered = description.lower()
        if any(keyword in lowered for keyword in self._CREDIT_KEYWORDS):
            return "credit"
        return "debit"

    def to_dict_list(self, transactions: "List[PDFTransaction]") -> List[Dict]:
        """Convert transactions to a list of plain dictionaries, one per transaction."""
        return [
            {
                "date": t.date,
                "description": t.description,
                "amount": t.amount,
                "type": t.type,
                "balance": t.balance,
                "reference": t.reference,
            }
            for t in transactions
        ]


class ImageOCRParser:
    """
    Parse transaction screenshots using OCR.

    Uses EasyOCR or pytesseract for text extraction. With the "auto" backend
    EasyOCR is tried first and pytesseract is the fallback.
    """

    # Backend names accepted by the constructor.
    _BACKENDS = ("auto", "easyocr", "tesseract")

    def __init__(self, backend: str = "auto"):
        """
        Initialize OCR parser.

        Args:
            backend: "easyocr", "tesseract", or "auto"

        Raises:
            ValueError: If *backend* is not one of the supported names.
            ImportError: If the requested backend (or, for "auto", every
                backend) is not installed.
        """
        self.backend = backend
        self.reader = None
        self._init_backend()

    @staticmethod
    def _new_easyocr_reader():
        """Create the EasyOCR reader used by this parser (English + Hindi)."""
        import easyocr
        return easyocr.Reader(['en', 'hi'])

    @staticmethod
    def _join_results(results) -> str:
        """Join the text element (index 1) of each EasyOCR result, one per line."""
        return "\n".join(result[1] for result in results)

    def _init_backend(self):
        """Resolve ``self.backend`` to a concrete backend and load what it needs."""
        if self.backend not in self._BACKENDS:
            # Previously an unknown name was accepted silently and every
            # extraction then returned ""; fail loudly at construction instead.
            raise ValueError(
                f"Unknown OCR backend {self.backend!r}; expected one of: "
                + ", ".join(self._BACKENDS)
            )

        if self.backend == "auto":
            try:
                self.reader = self._new_easyocr_reader()
                self.backend = "easyocr"
            except ImportError:
                try:
                    import pytesseract  # noqa: F401 - availability probe only
                    self.backend = "tesseract"
                except ImportError:
                    raise ImportError("No OCR backend available. Install easyocr or pytesseract")

        elif self.backend == "easyocr":
            self.reader = self._new_easyocr_reader()

        else:  # "tesseract"
            # Import now so a missing dependency surfaces at construction
            # time rather than on the first extraction call.
            import pytesseract  # noqa: F401

    def extract_text(self, image_path: Path) -> str:
        """
        Extract text from image.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text ("" if the backend is not a recognised one)
        """
        if self.backend == "easyocr":
            return self._join_results(self.reader.readtext(str(image_path)))

        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image

            # The context manager closes the underlying file handle once OCR
            # is done instead of leaving it to garbage collection.
            with Image.open(image_path) as image:
                return pytesseract.image_to_string(image)

        return ""

    def extract_text_from_bytes(self, image_bytes: bytes) -> str:
        """
        Extract text from image bytes.

        Args:
            image_bytes: Image content as bytes

        Returns:
            Extracted text ("" if the backend is not a recognised one)
        """
        if self.backend == "easyocr":
            import numpy as np
            from PIL import Image

            with Image.open(io.BytesIO(image_bytes)) as image:
                # Materialise the pixels while the image is still open.
                image_array = np.array(image)
            return self._join_results(self.reader.readtext(image_array))

        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image

            with Image.open(io.BytesIO(image_bytes)) as image:
                return pytesseract.image_to_string(image)

        return ""


# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def parse_pdf(file_path: str) -> List[Dict]:
    """
    Parse a statement PDF in one call.

    Builds a BankStatementParser, runs it over the file and converts the
    resulting transactions to dictionaries.

    Args:
        file_path: Location of the PDF file on disk

    Returns:
        One dictionary per extracted transaction
    """
    statement_parser = BankStatementParser()
    return statement_parser.to_dict_list(
        statement_parser.parse_file(Path(file_path))
    )


def parse_image(file_path: str) -> str:
    """
    Run OCR over an image file in one call.

    Builds an ImageOCRParser with its default backend and returns whatever
    text it extracts from the file.

    Args:
        file_path: Location of the image file on disk

    Returns:
        Text extracted from the image
    """
    return ImageOCRParser().extract_text(Path(file_path))


# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    # Command-line entry point: parse a statement PDF, or OCR any other file.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python pdf_parser.py <file.pdf>")
        sys.exit(1)

    file_path = sys.argv[1]

    try:
        # Compare the extension case-insensitively so "STATEMENT.PDF" is
        # parsed as a PDF instead of being sent to OCR.
        if file_path.lower().endswith('.pdf'):
            transactions = parse_pdf(file_path)
            print(f"Found {len(transactions)} transactions:")
            # Only the first ten rows are shown as a preview.
            for t in transactions[:10]:
                print(f"  {t['date']}: {t['type']} ₹{t['amount']:,.2f} - {t['description'][:40]}")
        else:
            text = parse_image(file_path)
            print("Extracted text:")
            print(text)
    except ImportError as e:
        # A missing optional dependency is a failure: report it and exit
        # with a non-zero status so shell callers can detect it.
        print(f"Error: {e}")
        sys.exit(1)