# Ranjit0034's picture
# Upload src/finee/pdf_parser.py with huggingface_hub
# f60e9c2 verified
"""
PDF Parser for Bank Statements
==============================
Extract transactions from Indian bank statement PDFs.
Supports:
- HDFC Bank statements
- ICICI Bank statements
- SBI Bank statements
- Axis Bank statements
- And more...
Author: Ranjit Behera
"""
import re
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
import io
@dataclass
class PDFTransaction:
    """
    A single transaction row recovered from statement text.

    The first four fields are required and positional; ``balance`` and
    ``reference`` default to ``None`` when the statement line carries no
    such information.
    """

    # Date text as it was matched in the statement.
    date: str
    # Free-text narration of the transaction.
    description: str
    # Transaction amount as a float.
    amount: float
    type: str  # debit or credit
    # Balance reported on the same row, when one was found.
    balance: Optional[float] = None
    # Reference identifier associated with the transaction, when one was found.
    reference: Optional[str] = None
class BankStatementParser:
    """
    Parse bank statement PDFs and extract transactions.

    Uses pdfplumber for text extraction and regex for parsing.

    Every ``BANK_PATTERNS`` entry provides a ``header`` regex that identifies
    the bank and a ``transaction`` regex whose first three groups are the
    date, the description and the amount.  Any further groups are optional
    and carry either a Dr/Cr marker or the running balance.
    """

    # Bank-specific patterns
    # NOTE(review): the SBI "transaction" regex only accepts 4-digit years
    # while the SBI "date" regex accepts 2-4 digits - confirm which is intended.
    BANK_PATTERNS = {
        "hdfc": {
            "header": r"HDFC\s+BANK",
            "date": r"(\d{2}/\d{2}/\d{2,4})",
            "transaction": r"(\d{2}/\d{2}/\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([DC]r)?\s*([\d,]+\.\d{2})?",
        },
        "icici": {
            "header": r"ICICI\s+BANK",
            "date": r"(\d{2}-\w{3}-\d{2,4})",
            "transaction": r"(\d{2}-\w{3}-\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*(Dr|Cr)?\s*([\d,]+\.\d{2})?",
        },
        "sbi": {
            "header": r"State\s+Bank\s+of\s+India",
            "date": r"(\d{2}\s+\w{3}\s+\d{2,4})",
            "transaction": r"(\d{2}\s+\w{3}\s+\d{4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([\d,]+\.\d{2})?",
        },
        "axis": {
            "header": r"AXIS\s+BANK",
            "date": r"(\d{2}-\d{2}-\d{2,4})",
            "transaction": r"(\d{2}-\d{2}-\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})\s*([\d,]+\.\d{2})?",
        },
    }

    # Generic pattern: date, description, amount
    _GENERIC_PATTERN = re.compile(
        r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})",
        re.MULTILINE,
    )

    # Reference-number patterns, tried in this order by _extract_reference.
    _REFERENCE_PATTERNS = (
        re.compile(r"[Rr]ef[.:# ]*(\d{10,18})"),
        re.compile(r"UTR[.:# ]*(\w{12,22})"),
        re.compile(r"IMPS[.:# ]*(\d{12})"),
        re.compile(r"NEFT[.:# ]*(\w{10,16})"),
    )

    # Upper-cased Dr/Cr markers recognised in a transaction row.
    _CREDIT_MARKERS = ("CR", "C")
    _DEBIT_MARKERS = ("DR", "D")

    # Description keywords used when a row carries no explicit Dr/Cr marker.
    _CREDIT_KEYWORDS = ("salary", "credited", "received", "refund", "cashback", "interest")
    _DEBIT_KEYWORDS = ("debited", "paid", "withdrawn", "transfer to", "payment")

    def __init__(self):
        self.pdfplumber = None
        self._check_dependencies()

    def _check_dependencies(self):
        """Import pdfplumber if available; leave ``self.pdfplumber`` as None otherwise."""
        try:
            import pdfplumber
            self.pdfplumber = pdfplumber
        except ImportError:
            self.pdfplumber = None

    def _extract_pdf_text(self, source) -> str:
        """
        Return the text of every page of a PDF, one page per line group.

        Args:
            source: Anything ``pdfplumber.open`` accepts (a path or a
                binary file-like object).

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if self.pdfplumber is None:
            raise ImportError("pdfplumber is required. Install with: pip install pdfplumber")
        with self.pdfplumber.open(source) as pdf:
            # Join with a newline so the last row of one page and the first
            # row of the next page are not fused into a single line.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    def parse_file(self, file_path: Path) -> List[PDFTransaction]:
        """
        Parse a PDF file and extract transactions.

        Args:
            file_path: Path to PDF file

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_pdf_text(file_path))

    def parse_bytes(self, pdf_bytes: bytes) -> List[PDFTransaction]:
        """
        Parse PDF from bytes.

        Args:
            pdf_bytes: PDF file content as bytes

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_pdf_text(io.BytesIO(pdf_bytes)))

    def parse_text(self, text: str) -> List[PDFTransaction]:
        """
        Parse extracted text and identify transactions.

        Args:
            text: Extracted text from PDF

        Returns:
            List of transactions (empty when ``text`` is empty or None)
        """
        if not text:
            return []
        # Detect bank
        bank = self._detect_bank(text)
        if bank:
            return self._parse_with_pattern(text, bank)
        return self._parse_generic(text)

    def _detect_bank(self, text: str) -> Optional[str]:
        """
        Detect which bank's statement this is.

        When several bank names occur in the text (for example another bank
        named inside a transaction description), the one that appears
        earliest wins; ties keep ``BANK_PATTERNS`` order.

        Returns:
            The ``BANK_PATTERNS`` key of the detected bank, or None.
        """
        detected = None
        earliest = None
        for bank, patterns in self.BANK_PATTERNS.items():
            match = re.search(patterns["header"], text, re.IGNORECASE)
            if match and (earliest is None or match.start() < earliest):
                detected, earliest = bank, match.start()
        return detected

    def _type_and_balance(self, extras, description: str) -> Tuple[str, Optional[float]]:
        """
        Interpret the optional groups that follow the amount in a row.

        Args:
            extras: Regex groups after the amount; each is None, a Dr/Cr
                marker, or a comma-formatted number.
            description: Row description, used to infer the type when no
                marker is present.

        Returns:
            ``(type, balance)`` where type is "debit" or "credit".

        Raises:
            ValueError: If a non-marker group is not a valid number.
        """
        txn_type = None
        balance = None
        for extra in extras:
            if not extra:
                continue
            marker = extra.upper()
            if marker in self._CREDIT_MARKERS:
                txn_type = "credit"
            elif marker in self._DEBIT_MARKERS:
                txn_type = "debit"
            elif balance is None:
                # Patterns without a marker group (SBI, Axis) put the
                # balance directly after the amount.
                balance = float(extra.replace(',', ''))
        if txn_type is None:
            txn_type = self._infer_type(description)
        return txn_type, balance

    def _parse_with_pattern(self, text: str, bank: str) -> List[PDFTransaction]:
        """Parse using the bank-specific transaction pattern for ``bank``."""
        pattern = self.BANK_PATTERNS[bank]["transaction"]
        transactions = []
        for match in re.finditer(pattern, text, re.MULTILINE):
            try:
                date, description, raw_amount = match.group(1, 2, 3)
                description = description.strip()
                amount = float(raw_amount.replace(',', ''))
                txn_type, balance = self._type_and_balance(match.groups()[3:], description)
            except (ValueError, IndexError):
                # Skip rows whose numbers cannot be parsed.
                continue
            transactions.append(PDFTransaction(
                date=date,
                description=description,
                amount=amount,
                type=txn_type,
                balance=balance,
                reference=self._extract_reference(description),
            ))
        return transactions

    def _parse_generic(self, text: str) -> List[PDFTransaction]:
        """Generic parsing for unknown bank formats (date, description, amount)."""
        transactions = []
        for match in self._GENERIC_PATTERN.finditer(text):
            try:
                date, description, raw_amount = match.group(1, 2, 3)
                description = description.strip()
                amount = float(raw_amount.replace(',', ''))
            except (ValueError, IndexError):
                continue
            transactions.append(PDFTransaction(
                date=date,
                description=description,
                amount=amount,
                # Infer type from description
                type=self._infer_type(description),
                reference=self._extract_reference(description),
            ))
        return transactions

    def _extract_reference(self, description: str) -> Optional[str]:
        """Return the first reference number found in ``description``, or None."""
        for pattern in self._REFERENCE_PATTERNS:
            match = pattern.search(description)
            if match:
                return match.group(1)
        return None

    def _infer_type(self, description: str) -> str:
        """
        Infer transaction type from description keywords.

        Credit keywords are checked before debit keywords; a description
        matching neither is treated as a debit.
        """
        description_lower = description.lower()
        if any(kw in description_lower for kw in self._CREDIT_KEYWORDS):
            return "credit"
        if any(kw in description_lower for kw in self._DEBIT_KEYWORDS):
            return "debit"
        return "debit"  # Default to debit

    def to_dict_list(self, transactions: List[PDFTransaction]) -> List[Dict]:
        """Convert transactions to a list of plain dictionaries."""
        return [
            {
                "date": t.date,
                "description": t.description,
                "amount": t.amount,
                "type": t.type,
                "balance": t.balance,
                "reference": t.reference,
            }
            for t in transactions
        ]
class ImageOCRParser:
    """
    Parse transaction screenshots using OCR.

    Uses EasyOCR or pytesseract for text extraction.
    """

    # Languages handed to easyocr.Reader when the caller does not supply any.
    DEFAULT_LANGUAGES = ("en", "hi")

    def __init__(self, backend: str = "auto", languages: Optional[List[str]] = None):
        """
        Initialize OCR parser.

        Args:
            backend: "easyocr", "tesseract", or "auto"
            languages: Language codes for the EasyOCR reader; defaults to
                ``DEFAULT_LANGUAGES``.

        Raises:
            ValueError: If ``backend`` is not one of the supported names.
            ImportError: If the requested backend (or, for "auto", every
                backend) cannot be imported.
        """
        self.backend = backend
        self.languages = list(languages) if languages else list(self.DEFAULT_LANGUAGES)
        self.reader = None
        self._init_backend()

    def _create_easyocr_reader(self):
        """Import easyocr and build a Reader for ``self.languages``."""
        import easyocr
        return easyocr.Reader(self.languages)

    def _init_backend(self):
        """Initialize OCR backend, resolving "auto" to a concrete backend name."""
        if self.backend == "auto":
            try:
                self.reader = self._create_easyocr_reader()
                self.backend = "easyocr"
            except ImportError:
                try:
                    import pytesseract  # noqa: F401 - availability check only
                except ImportError as exc:
                    raise ImportError(
                        "No OCR backend available. Install easyocr or pytesseract"
                    ) from exc
                self.backend = "tesseract"
        elif self.backend == "easyocr":
            self.reader = self._create_easyocr_reader()
        elif self.backend == "tesseract":
            import pytesseract  # noqa: F401 - availability check only
        else:
            # Previously an unknown name was accepted and every extraction
            # silently returned an empty string.
            raise ValueError(
                f"Unknown OCR backend {self.backend!r}; "
                "expected 'easyocr', 'tesseract', or 'auto'"
            )

    def extract_text(self, image_path: Path) -> str:
        """
        Extract text from image.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            results = self.reader.readtext(str(image_path))
            # Each result's second element is the recognised text.
            return "\n".join(r[1] for r in results)
        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image
            # Context manager closes the underlying file once OCR is done.
            with Image.open(image_path) as image:
                return pytesseract.image_to_string(image)
        return ""

    def extract_text_from_bytes(self, image_bytes: bytes) -> str:
        """
        Extract text from image bytes.

        Args:
            image_bytes: Image content as bytes

        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            import numpy as np
            from PIL import Image
            with Image.open(io.BytesIO(image_bytes)) as image:
                image_array = np.array(image)
            results = self.reader.readtext(image_array)
            return "\n".join(r[1] for r in results)
        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image
            with Image.open(io.BytesIO(image_bytes)) as image:
                return pytesseract.image_to_string(image)
        return ""
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================
def parse_pdf(file_path: str) -> List[Dict]:
    """
    One-call helper: read a statement PDF and return its transactions.

    Args:
        file_path: Location of the PDF file

    Returns:
        One dictionary per transaction, as produced by
        ``BankStatementParser.to_dict_list``
    """
    statement_parser = BankStatementParser()
    return statement_parser.to_dict_list(
        statement_parser.parse_file(Path(file_path))
    )
def parse_image(file_path: str) -> str:
    """
    One-call helper: run OCR on an image file and return its text.

    Args:
        file_path: Location of the image file

    Returns:
        Text produced by ``ImageOCRParser.extract_text``
    """
    # The parser is constructed before the path, matching the order in which
    # a missing OCR backend or a bad path would surface.
    return ImageOCRParser().extract_text(Path(file_path))
# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    # Command-line entry point: a .pdf argument is parsed as a statement,
    # anything else is treated as an image and run through OCR.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python pdf_parser.py <file.pdf>")
        sys.exit(1)

    file_path = sys.argv[1]
    try:
        # Compare the extension case-insensitively so "STATEMENT.PDF" is
        # not mistaken for an image.
        if file_path.lower().endswith('.pdf'):
            transactions = parse_pdf(file_path)
            print(f"Found {len(transactions)} transactions:")
            # Show at most the first ten rows.
            for t in transactions[:10]:
                print(f" {t['date']}: {t['type']} {t['amount']:,.2f} - {t['description'][:40]}")
        else:
            text = parse_image(file_path)
            print("Extracted text:")
            print(text)
    except ImportError as e:
        # A missing optional dependency is a failure: report it on stderr
        # and exit non-zero so callers can detect it.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)