|
|
""" |
|
|
PDF Parser for Bank Statements |
|
|
============================== |
|
|
|
|
|
Extract transactions from Indian bank statement PDFs. |
|
|
|
|
|
Supports: |
|
|
- HDFC Bank statements |
|
|
- ICICI Bank statements |
|
|
- SBI Bank statements |
|
|
- Axis Bank statements |
|
|
- Other banks on a best-effort basis, via a generic date/description/amount pattern |
|
|
|
|
|
Author: Ranjit Behera |
|
|
""" |
|
|
|
|
|
import re |
|
|
from pathlib import Path |
|
|
from typing import List, Dict, Optional, Tuple |
|
|
from dataclasses import dataclass |
|
|
from datetime import datetime |
|
|
import io |
|
|
|
|
|
|
|
|
@dataclass
class PDFTransaction:
    """A single transaction row recovered from statement text.

    Instances are plain value objects; the parsers in this module fill them
    in from regex matches and perform no further validation.
    """

    # Date exactly as it appeared in the statement (not normalised).
    date: str
    # Free-text narration of the transaction, stripped of outer whitespace.
    description: str
    # Transaction amount with thousands separators removed.
    amount: float
    # Direction of the transaction: "debit" or "credit".
    type: str
    # Running balance after the transaction, when the statement row has one.
    balance: Optional[float] = None
    # Reference / UTR number found inside the description, if any.
    reference: Optional[str] = None
|
|
|
|
|
|
|
|
class BankStatementParser:
    """
    Parse bank statement PDFs and extract transactions.

    Uses pdfplumber for text extraction and regex for parsing. pdfplumber is
    imported lazily, so the class can be constructed (and ``parse_text`` used
    on already-extracted text) even when pdfplumber is not installed; only
    ``parse_file`` / ``parse_bytes`` require it.
    """

    # Per-bank regexes.
    #   header      - identifies the bank from the statement text
    #   date        - the bank's date format (kept for callers; unused here)
    #   transaction - one statement row. Groups are named so that banks whose
    #                 rows have no Dr/Cr marker (SBI, Axis) cannot have their
    #                 balance column mistaken for the marker. The positional
    #                 order of the groups is unchanged.
    BANK_PATTERNS = {
        "hdfc": {
            "header": r"HDFC\s+BANK",
            "date": r"(\d{2}/\d{2}/\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<marker>[DC]r)?\s*"
                r"(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "icici": {
            "header": r"ICICI\s+BANK",
            "date": r"(\d{2}-\w{3}-\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}-\w{3}-\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<marker>Dr|Cr)?\s*"
                r"(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "sbi": {
            "header": r"State\s+Bank\s+of\s+India",
            "date": r"(\d{2}\s+\w{3}\s+\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}\s+\w{3}\s+\d{4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "axis": {
            "header": r"AXIS\s+BANK",
            "date": r"(\d{2}-\d{2}-\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}-\d{2}-\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
    }

    # Fallback row pattern for statements whose bank could not be identified.
    _GENERIC_PATTERN = r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})"

    # Tried in order; the first pattern that matches supplies the reference.
    _REFERENCE_PATTERNS = (
        r"[Rr]ef[.:# ]*(\d{10,18})",
        r"UTR[.:# ]*(\w{12,22})",
        r"IMPS[.:# ]*(\d{12})",
        r"NEFT[.:# ]*(\w{10,16})",
    )

    # Description substrings (lower-case) that mark a row as a credit.
    _CREDIT_KEYWORDS = ("salary", "credited", "received", "refund", "cashback", "interest")

    def __init__(self):
        self.pdfplumber = None
        self._check_dependencies()

    def _check_dependencies(self):
        """Import pdfplumber if available; leave ``self.pdfplumber`` as None otherwise."""
        try:
            import pdfplumber
            self.pdfplumber = pdfplumber
        except ImportError:
            self.pdfplumber = None

    def _extract_pdf_text(self, source) -> str:
        """
        Return the text of every page of a PDF, one page per line group.

        Args:
            source: Anything ``pdfplumber.open`` accepts (a path or a binary
                file-like object).

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if self.pdfplumber is None:
            raise ImportError("pdfplumber is required. Install with: pip install pdfplumber")

        with self.pdfplumber.open(source) as pdf:
            # Join pages with a newline: plain concatenation glues the last
            # row of one page onto the first row of the next. Pages with no
            # extractable text yield None and contribute an empty string.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    def parse_file(self, file_path: Path) -> List[PDFTransaction]:
        """
        Parse a PDF file and extract transactions.

        Args:
            file_path: Path to PDF file

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_pdf_text(file_path))

    def parse_bytes(self, pdf_bytes: bytes) -> List[PDFTransaction]:
        """
        Parse PDF from bytes.

        Args:
            pdf_bytes: PDF file content as bytes

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_pdf_text(io.BytesIO(pdf_bytes)))

    def parse_text(self, text: str) -> List[PDFTransaction]:
        """
        Parse extracted text and identify transactions.

        The bank-specific pattern is used when a known bank header is found
        in the text; otherwise the generic fallback pattern is used.

        Args:
            text: Extracted text from PDF

        Returns:
            List of transactions
        """
        bank = self._detect_bank(text)
        if bank:
            return self._parse_with_pattern(text, bank)
        return self._parse_generic(text)

    def _detect_bank(self, text: str) -> Optional[str]:
        """Return the key of the first bank whose header pattern occurs in ``text``, else None."""
        for bank, patterns in self.BANK_PATTERNS.items():
            # IGNORECASE already makes the search case-insensitive, so no
            # upper-cased copy of the (potentially large) text is needed.
            if re.search(patterns["header"], text, re.IGNORECASE):
                return bank
        return None

    @staticmethod
    def _to_amount(raw: str) -> float:
        """Convert a statement amount such as ``"1,23,456.78"`` to a float."""
        return float(raw.replace(',', ''))

    def _parse_with_pattern(self, text: str, bank: str) -> List[PDFTransaction]:
        """
        Parse using bank-specific pattern.

        The transaction pattern must define the named groups ``date``,
        ``description`` and ``amount``; ``marker`` (Dr/Cr) and ``balance``
        are optional. Rows that cannot be converted are skipped.

        Args:
            text: Extracted statement text.
            bank: Key into ``BANK_PATTERNS``.

        Returns:
            Transactions in the order they appear in ``text``.
        """
        row_pattern = self.BANK_PATTERNS[bank]["transaction"]
        transactions = []

        for match in re.finditer(row_pattern, text, re.MULTILINE):
            try:
                # match.group(name) raises IndexError for an undefined group.
                date = match.group("date")
                description = match.group("description").strip()
                amount = self._to_amount(match.group("amount"))

                optional = match.groupdict()
                balance_text = optional.get("balance")
                balance = self._to_amount(balance_text) if balance_text else None
            except (ValueError, IndexError):
                continue

            marker = optional.get("marker")
            if marker:
                txn_type = "credit" if marker.upper() in ("CR", "C") else "debit"
            else:
                # No explicit Dr/Cr column on this row: fall back to the same
                # keyword heuristic the generic parser uses instead of
                # labelling every row a debit.
                txn_type = self._infer_type(description)

            transactions.append(PDFTransaction(
                date=date,
                description=description,
                amount=amount,
                type=txn_type,
                balance=balance,
                reference=self._extract_reference(description),
            ))

        return transactions

    def _parse_generic(self, text: str) -> List[PDFTransaction]:
        """
        Generic parsing for unknown bank formats.

        Matches ``<date> <description> <amount>`` rows; the transaction type
        is inferred from the description and no balance is extracted.
        """
        transactions = []

        for match in re.finditer(self._GENERIC_PATTERN, text, re.MULTILINE):
            try:
                date = match.group(1)
                description = match.group(2).strip()
                amount = self._to_amount(match.group(3))
            except (ValueError, IndexError):
                continue

            transactions.append(PDFTransaction(
                date=date,
                description=description,
                amount=amount,
                type=self._infer_type(description),
                reference=self._extract_reference(description),
            ))

        return transactions

    def _extract_reference(self, description: str) -> Optional[str]:
        """Extract reference number from description, or return None if none is found."""
        for pattern in self._REFERENCE_PATTERNS:
            match = re.search(pattern, description)
            if match:
                return match.group(1)
        return None

    def _infer_type(self, description: str) -> str:
        """
        Infer transaction type from description.

        Returns "credit" when the description contains any credit keyword
        (case-insensitive substring match) and "debit" otherwise.
        """
        lowered = description.lower()
        if any(keyword in lowered for keyword in self._CREDIT_KEYWORDS):
            return "credit"
        # Anything that does not look like a credit is treated as a debit.
        return "debit"

    def to_dict_list(self, transactions: List[PDFTransaction]) -> List[Dict]:
        """Convert transactions to list of dictionaries (one dict per transaction)."""
        return [
            {
                "date": t.date,
                "description": t.description,
                "amount": t.amount,
                "type": t.type,
                "balance": t.balance,
                "reference": t.reference,
            }
            for t in transactions
        ]
|
|
|
|
|
|
|
|
class ImageOCRParser:
    """
    Parse transaction screenshots using OCR.

    Uses EasyOCR or pytesseract for text extraction. The OCR libraries are
    imported lazily, so only the selected backend needs to be installed.
    """

    _SUPPORTED_BACKENDS = ("auto", "easyocr", "tesseract")

    def __init__(self, backend: str = "auto"):
        """
        Initialize OCR parser.

        Args:
            backend: "easyocr", "tesseract", or "auto"

        Raises:
            ValueError: If ``backend`` is not one of the supported names.
            ImportError: If the requested backend (or, for "auto", every
                backend) is not installed.
        """
        self.backend = backend
        self.reader = None
        self._init_backend()

    def _init_backend(self):
        """Initialize OCR backend, resolving "auto" to a concrete backend name."""
        if self.backend not in self._SUPPORTED_BACKENDS:
            # Fail fast: an unrecognised name would otherwise produce a parser
            # that silently returns "" for every image.
            raise ValueError(
                f"Unknown OCR backend {self.backend!r}; expected one of "
                f"{', '.join(self._SUPPORTED_BACKENDS)}"
            )

        if self.backend == "auto":
            # Prefer EasyOCR, fall back to pytesseract.
            try:
                import easyocr
                self.reader = easyocr.Reader(['en', 'hi'])
                self.backend = "easyocr"
            except ImportError:
                try:
                    import pytesseract  # noqa: F401 - availability check only
                    self.backend = "tesseract"
                except ImportError:
                    raise ImportError("No OCR backend available. Install easyocr or pytesseract")

        elif self.backend == "easyocr":
            import easyocr
            self.reader = easyocr.Reader(['en', 'hi'])

        elif self.backend == "tesseract":
            # Import now so a missing dependency surfaces at construction time.
            import pytesseract  # noqa: F401

    @staticmethod
    def _join_lines(results) -> str:
        """Join the text element (index 1) of each EasyOCR result, one per line."""
        return "\n".join(result[1] for result in results)

    def extract_text(self, image_path: Path) -> str:
        """
        Extract text from image.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            return self._join_lines(self.reader.readtext(str(image_path)))

        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image

            # Context manager closes the underlying file handle promptly.
            with Image.open(image_path) as image:
                return pytesseract.image_to_string(image)

        return ""

    def extract_text_from_bytes(self, image_bytes: bytes) -> str:
        """
        Extract text from image bytes.

        Args:
            image_bytes: Image content as bytes

        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            import numpy as np
            from PIL import Image

            with Image.open(io.BytesIO(image_bytes)) as image:
                # Normalise to RGB first: palette ("P") and 1-bit images would
                # otherwise become arrays of palette indices / booleans
                # rather than pixel intensities.
                pixels = np.array(image.convert("RGB"))
            return self._join_lines(self.reader.readtext(pixels))

        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image

            with Image.open(io.BytesIO(image_bytes)) as image:
                return pytesseract.image_to_string(image)

        return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_pdf(file_path: str) -> List[Dict]:
    """
    Parse a statement PDF in one call.

    Builds a fresh BankStatementParser, runs it over the file and returns
    the result in dictionary form.

    Args:
        file_path: Path to PDF file

    Returns:
        List of transaction dictionaries
    """
    statement_parser = BankStatementParser()
    pdf_location = Path(file_path)
    return statement_parser.to_dict_list(statement_parser.parse_file(pdf_location))
|
|
|
|
|
|
|
|
def parse_image(file_path: str) -> str:
    """
    Run OCR over an image in one call.

    Builds an ImageOCRParser with its default backend selection and returns
    whatever text it extracts from the file.

    Args:
        file_path: Path to image file

    Returns:
        Extracted text
    """
    image_location = Path(file_path)
    return ImageOCRParser().extract_text(image_location)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: a path ending in ".pdf" is parsed as a bank
    # statement; any other path is treated as an image and run through OCR.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python pdf_parser.py <file.pdf>")
        sys.exit(1)

    file_path = sys.argv[1]

    try:
        # Compare the suffix case-insensitively so "STATEMENT.PDF" is not
        # mistaken for an image.
        if file_path.lower().endswith('.pdf'):
            transactions = parse_pdf(file_path)
            print(f"Found {len(transactions)} transactions:")
            # Show only the first ten rows to keep the output short.
            for t in transactions[:10]:
                print(f" {t['date']}: {t['type']} ₹{t['amount']:,.2f} - {t['description'][:40]}")
        else:
            text = parse_image(file_path)
            print("Extracted text:")
            print(text)
    except ImportError as e:
        # A missing optional dependency is a failure: report it on stderr and
        # exit non-zero so calling scripts can detect it.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|