""" PDF Parser for Bank Statements ============================== Extract transactions from Indian bank statement PDFs. Supports: - HDFC Bank statements - ICICI Bank statements - SBI Bank statements - Axis Bank statements - And more... Author: Ranjit Behera """ import re from pathlib import Path from typing import List, Dict, Optional, Tuple from dataclasses import dataclass from datetime import datetime import io @dataclass class PDFTransaction: """Parsed transaction from PDF.""" date: str description: str amount: float type: str # debit or credit balance: Optional[float] = None reference: Optional[str] = None class BankStatementParser: """ Parse bank statement PDFs and extract transactions. Uses pdfplumber for text extraction and regex for parsing. """ # Bank-specific patterns BANK_PATTERNS = { "hdfc": { "header": r"HDFC\s+BANK", "date": r"(\d{2}/\d{2}/\d{2,4})", "transaction": r"(\d{2}/\d{2}/\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([DC]r)?\s*([\d,]+\.\d{2})?", }, "icici": { "header": r"ICICI\s+BANK", "date": r"(\d{2}-\w{3}-\d{2,4})", "transaction": r"(\d{2}-\w{3}-\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*(Dr|Cr)?\s*([\d,]+\.\d{2})?", }, "sbi": { "header": r"State\s+Bank\s+of\s+India", "date": r"(\d{2}\s+\w{3}\s+\d{2,4})", "transaction": r"(\d{2}\s+\w{3}\s+\d{4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([\d,]+\.\d{2})?", }, "axis": { "header": r"AXIS\s+BANK", "date": r"(\d{2}-\d{2}-\d{2,4})", "transaction": r"(\d{2}-\d{2}-\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([\d,]+\.\d{2})?", }, } def __init__(self): self.pdfplumber = None self._check_dependencies() def _check_dependencies(self): """Check if pdfplumber is available.""" try: import pdfplumber self.pdfplumber = pdfplumber except ImportError: self.pdfplumber = None def parse_file(self, file_path: Path) -> List[PDFTransaction]: """ Parse a PDF file and extract transactions. Args: file_path: Path to PDF file Returns: List of extracted transactions """ if self.pdfplumber is None: raise ImportError("pdfplumber is required. Install with: pip install pdfplumber") with self.pdfplumber.open(file_path) as pdf: text = "" for page in pdf.pages: text += page.extract_text() or "" return self.parse_text(text) def parse_bytes(self, pdf_bytes: bytes) -> List[PDFTransaction]: """ Parse PDF from bytes. Args: pdf_bytes: PDF file content as bytes Returns: List of extracted transactions """ if self.pdfplumber is None: raise ImportError("pdfplumber is required. Install with: pip install pdfplumber") with self.pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: text = "" for page in pdf.pages: text += page.extract_text() or "" return self.parse_text(text) def parse_text(self, text: str) -> List[PDFTransaction]: """ Parse extracted text and identify transactions. Args: text: Extracted text from PDF Returns: List of transactions """ # Detect bank bank = self._detect_bank(text) if bank: return self._parse_with_pattern(text, bank) else: return self._parse_generic(text) def _detect_bank(self, text: str) -> Optional[str]: """Detect which bank's statement this is.""" text_upper = text.upper() for bank, patterns in self.BANK_PATTERNS.items(): if re.search(patterns["header"], text_upper, re.IGNORECASE): return bank return None def _parse_with_pattern(self, text: str, bank: str) -> List[PDFTransaction]: """Parse using bank-specific pattern.""" patterns = self.BANK_PATTERNS[bank] transactions = [] for match in re.finditer(patterns["transaction"], text, re.MULTILINE): try: date = match.group(1) description = match.group(2).strip() amount = float(match.group(3).replace(',', '')) # Determine type txn_type = "debit" if len(match.groups()) > 3 and match.group(4): if match.group(4).upper() in ["CR", "C"]: txn_type = "credit" # Extract balance if present balance = None if len(match.groups()) > 4 and match.group(5): balance = float(match.group(5).replace(',', '')) # Extract reference from description reference = self._extract_reference(description) transactions.append(PDFTransaction( date=date, description=description, amount=amount, type=txn_type, balance=balance, reference=reference, )) except (ValueError, IndexError): continue return transactions def _parse_generic(self, text: str) -> List[PDFTransaction]: """Generic parsing for unknown bank formats.""" transactions = [] # Generic pattern: date, description, amount pattern = r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})" for match in re.finditer(pattern, text, re.MULTILINE): try: date = match.group(1) description = match.group(2).strip() amount = float(match.group(3).replace(',', '')) # Infer type from description txn_type = self._infer_type(description) reference = self._extract_reference(description) transactions.append(PDFTransaction( date=date, description=description, amount=amount, type=txn_type, reference=reference, )) except (ValueError, IndexError): continue return transactions def _extract_reference(self, description: str) -> Optional[str]: """Extract reference number from description.""" patterns = [ r"[Rr]ef[.:# ]*(\d{10,18})", r"UTR[.:# ]*(\w{12,22})", r"IMPS[.:# ]*(\d{12})", r"NEFT[.:# ]*(\w{10,16})", ] for pattern in patterns: match = re.search(pattern, description) if match: return match.group(1) return None def _infer_type(self, description: str) -> str: """Infer transaction type from description.""" description_lower = description.lower() credit_keywords = ["salary", "credited", "received", "refund", "cashback", "interest"] debit_keywords = ["debited", "paid", "withdrawn", "transfer to", "payment"] for kw in credit_keywords: if kw in description_lower: return "credit" for kw in debit_keywords: if kw in description_lower: return "debit" return "debit" # Default to debit def to_dict_list(self, transactions: List[PDFTransaction]) -> List[Dict]: """Convert transactions to list of dictionaries.""" return [ { "date": t.date, "description": t.description, "amount": t.amount, "type": t.type, "balance": t.balance, "reference": t.reference, } for t in transactions ] class ImageOCRParser: """ Parse transaction screenshots using OCR. Uses EasyOCR or pytesseract for text extraction. """ def __init__(self, backend: str = "auto"): """ Initialize OCR parser. Args: backend: "easyocr", "tesseract", or "auto" """ self.backend = backend self.reader = None self._init_backend() def _init_backend(self): """Initialize OCR backend.""" if self.backend == "auto": try: import easyocr self.reader = easyocr.Reader(['en', 'hi']) self.backend = "easyocr" except ImportError: try: import pytesseract self.backend = "tesseract" except ImportError: raise ImportError("No OCR backend available. Install easyocr or pytesseract") elif self.backend == "easyocr": import easyocr self.reader = easyocr.Reader(['en', 'hi']) elif self.backend == "tesseract": import pytesseract def extract_text(self, image_path: Path) -> str: """ Extract text from image. Args: image_path: Path to image file Returns: Extracted text """ if self.backend == "easyocr": results = self.reader.readtext(str(image_path)) return "\n".join([r[1] for r in results]) elif self.backend == "tesseract": import pytesseract from PIL import Image image = Image.open(image_path) return pytesseract.image_to_string(image) return "" def extract_text_from_bytes(self, image_bytes: bytes) -> str: """ Extract text from image bytes. Args: image_bytes: Image content as bytes Returns: Extracted text """ if self.backend == "easyocr": import numpy as np from PIL import Image image = Image.open(io.BytesIO(image_bytes)) image_array = np.array(image) results = self.reader.readtext(image_array) return "\n".join([r[1] for r in results]) elif self.backend == "tesseract": import pytesseract from PIL import Image image = Image.open(io.BytesIO(image_bytes)) return pytesseract.image_to_string(image) return "" # ============================================================================ # UTILITY FUNCTIONS # ============================================================================ def parse_pdf(file_path: str) -> List[Dict]: """ Convenience function to parse PDF. Args: file_path: Path to PDF file Returns: List of transaction dictionaries """ parser = BankStatementParser() transactions = parser.parse_file(Path(file_path)) return parser.to_dict_list(transactions) def parse_image(file_path: str) -> str: """ Convenience function to extract text from image. Args: file_path: Path to image file Returns: Extracted text """ parser = ImageOCRParser() return parser.extract_text(Path(file_path)) # ============================================================================ # MAIN # ============================================================================ if __name__ == "__main__": import sys if len(sys.argv) < 2: print("Usage: python pdf_parser.py ") sys.exit(1) file_path = sys.argv[1] if file_path.endswith('.pdf'): try: transactions = parse_pdf(file_path) print(f"Found {len(transactions)} transactions:") for t in transactions[:10]: print(f" {t['date']}: {t['type']} ₹{t['amount']:,.2f} - {t['description'][:40]}") except ImportError as e: print(f"Error: {e}") else: try: text = parse_image(file_path) print("Extracted text:") print(text) except ImportError as e: print(f"Error: {e}")