finance-entity-extractor / src /finee /pdf_parser.py

Upload src/finee/pdf_parser.py with huggingface_hub

f60e9c2 verified 18 days ago

12.7 kB

	"""
	PDF Parser for Bank Statements
	==============================

	Extract transactions from Indian bank statement PDFs.

	Supports:
	- HDFC Bank statements
	- ICICI Bank statements
	- SBI Bank statements
	- Axis Bank statements
	- Other banks via a generic date / description / amount fallback pattern

	Author: Ranjit Behera
	"""

	import re
	from pathlib import Path
	from typing import List, Dict, Optional, Tuple
	from dataclasses import dataclass
	from datetime import datetime
	import io


	@dataclass()
	class PDFTransaction:
	    """One transaction record recovered from statement text.

	    Attributes:
	        date: Transaction date as a string.
	        description: Free-text narration of the transaction.
	        amount: Transaction amount.
	        type: Direction of the transaction, "debit" or "credit".
	        balance: Balance associated with the transaction, when known.
	        reference: Reference number associated with the transaction,
	            when known.
	    """

	    # Required fields (positional order is part of the constructor API).
	    date: str
	    description: str
	    amount: float
	    type: str  # "debit" or "credit"

	    # Optional fields; both default to "not available".
	    balance: Optional[float] = None
	    reference: Optional[str] = None


	class BankStatementParser:
	    """
	    Parse bank statement PDFs and extract transactions.

	    Text is pulled out of the PDF with pdfplumber and then scanned with
	    regular expressions. When the text matches one of the headers in
	    ``BANK_PATTERNS`` that bank's transaction pattern is used; otherwise
	    a generic date / description / amount pattern is applied.
	    """

	    # Bank-specific patterns.
	    #
	    # Every "transaction" regex exposes the named groups ``date``,
	    # ``description`` and ``amount``; ``type`` (Dr/Cr marker) and
	    # ``balance`` are optional. The parser reads groups by name, so a
	    # pattern without a Dr/Cr column (SBI, Axis) still yields its balance.
	    # Group numbering is the same as in the purely positional form.
	    BANK_PATTERNS = {
	        "hdfc": {
	            "header": r"HDFC\s+BANK",
	            "date": r"(\d{2}/\d{2}/\d{2,4})",
	            "transaction": (
	                r"(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+"
	                r"(?P<amount>[\d,]+\.\d{2})"
	                # Marker and balance are each optional and must sit on the
	                # same line as the amount.
	                r"(?:[ \t]+(?P<type>[DC]r))?"
	                r"(?:[ \t]+(?P<balance>[\d,]+\.\d{2}))?"
	            ),
	        },
	        "icici": {
	            "header": r"ICICI\s+BANK",
	            "date": r"(\d{2}-\w{3}-\d{2,4})",
	            "transaction": (
	                r"(?P<date>\d{2}-\w{3}-\d{2,4})\s+(?P<description>.+?)\s+"
	                r"(?P<amount>[\d,]+\.\d{2})"
	                # Unescaped alternation: matches "Dr" or "Cr".
	                r"(?:[ \t]+(?P<type>Dr|Cr))?"
	                r"(?:[ \t]+(?P<balance>[\d,]+\.\d{2}))?"
	            ),
	        },
	        "sbi": {
	            "header": r"State\s+Bank\s+of\s+India",
	            "date": r"(\d{2}\s+\w{3}\s+\d{2,4})",
	            "transaction": (
	                # Year width kept in line with the "date" pattern above.
	                r"(?P<date>\d{2}\s+\w{3}\s+\d{2,4})\s+(?P<description>.+?)\s+"
	                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
	            ),
	        },
	        "axis": {
	            "header": r"AXIS\s+BANK",
	            "date": r"(\d{2}-\d{2}-\d{2,4})",
	            "transaction": (
	                r"(?P<date>\d{2}-\d{2}-\d{2,4})\s+(?P<description>.+?)\s+"
	                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
	            ),
	        },
	    }

	    # Fallback for statements whose bank is not recognised:
	    # date, description, amount.
	    _GENERIC_PATTERN = re.compile(
	        r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})"
	    )

	    # Reference-number patterns, tried in order; the first hit wins.
	    _REFERENCE_PATTERNS = tuple(
	        re.compile(pattern)
	        for pattern in (
	            r"[Rr]ef[.:# ]*(\d{10,18})",
	            r"UTR[.:# ]*(\w{12,22})",
	            r"IMPS[.:# ]*(\d{12})",
	            r"NEFT[.:# ]*(\w{10,16})",
	        )
	    )

	    # Keywords used to guess the transaction direction from its description.
	    _CREDIT_KEYWORDS = ("salary", "credited", "received", "refund", "cashback", "interest")
	    _DEBIT_KEYWORDS = ("debited", "paid", "withdrawn", "transfer to", "payment")

	    def __init__(self):
	        self.pdfplumber = None
	        self._check_dependencies()

	    def _check_dependencies(self):
	        """Import pdfplumber if available; leave ``self.pdfplumber`` as None otherwise."""
	        try:
	            import pdfplumber
	            self.pdfplumber = pdfplumber
	        except ImportError:
	            self.pdfplumber = None

	    def _extract_text(self, source) -> str:
	        """
	        Return the text of every page of a PDF, one page per line group.

	        Args:
	            source: Anything ``pdfplumber.open`` accepts (a path or a
	                binary file-like object)

	        Returns:
	            Page texts joined with newlines

	        Raises:
	            ImportError: If pdfplumber is not installed
	        """
	        if self.pdfplumber is None:
	            raise ImportError("pdfplumber is required. Install with: pip install pdfplumber")

	        with self.pdfplumber.open(source) as pdf:
	            # Join with "\n" so the last line of one page cannot fuse with
	            # the first line of the next; pages without text contribute "".
	            return "\n".join(page.extract_text() or "" for page in pdf.pages)

	    def parse_file(self, file_path: Path) -> List[PDFTransaction]:
	        """
	        Parse a PDF file and extract transactions.

	        Args:
	            file_path: Path to PDF file

	        Returns:
	            List of extracted transactions

	        Raises:
	            ImportError: If pdfplumber is not installed
	        """
	        return self.parse_text(self._extract_text(file_path))

	    def parse_bytes(self, pdf_bytes: bytes) -> List[PDFTransaction]:
	        """
	        Parse PDF from bytes.

	        Args:
	            pdf_bytes: PDF file content as bytes

	        Returns:
	            List of extracted transactions

	        Raises:
	            ImportError: If pdfplumber is not installed
	        """
	        return self.parse_text(self._extract_text(io.BytesIO(pdf_bytes)))

	    def parse_text(self, text: str) -> List[PDFTransaction]:
	        """
	        Parse extracted text and identify transactions.

	        Args:
	            text: Extracted text from PDF

	        Returns:
	            List of transactions (empty when nothing matches)
	        """
	        bank = self._detect_bank(text)
	        if bank:
	            return self._parse_with_pattern(text, bank)
	        return self._parse_generic(text)

	    def _detect_bank(self, text: str) -> Optional[str]:
	        """Return the first ``BANK_PATTERNS`` key whose header occurs in ``text``, else None."""
	        for bank, patterns in self.BANK_PATTERNS.items():
	            # IGNORECASE already makes the search case-insensitive.
	            if re.search(patterns["header"], text, re.IGNORECASE):
	                return bank
	        return None

	    @staticmethod
	    def _to_amount(raw: str) -> float:
	        """Convert an amount such as "1,23,456.78" to a float."""
	        return float(raw.replace(',', ''))

	    def _parse_with_pattern(self, text: str, bank: str) -> List[PDFTransaction]:
	        """
	        Parse using bank-specific pattern.

	        Args:
	            text: Extracted statement text
	            bank: Key into ``BANK_PATTERNS``

	        Returns:
	            One transaction per regex match, in order of appearance
	        """
	        pattern = re.compile(self.BANK_PATTERNS[bank]["transaction"], re.MULTILINE)
	        transactions = []

	        for match in pattern.finditer(text):
	            fields = match.groupdict()
	            try:
	                amount = self._to_amount(fields["amount"])
	                # "balance" is absent from the dict or None when not captured.
	                raw_balance = fields.get("balance")
	                balance = self._to_amount(raw_balance) if raw_balance else None
	            except ValueError:
	                continue

	            description = fields["description"].strip()

	            # Prefer the explicit Dr/Cr marker; without one (SBI, Axis, or a
	            # line where the column is blank) fall back to keyword
	            # inference, which itself defaults to "debit".
	            marker = fields.get("type")
	            if marker:
	                txn_type = "credit" if marker.upper() in ("CR", "C") else "debit"
	            else:
	                txn_type = self._infer_type(description)

	            transactions.append(PDFTransaction(
	                date=fields["date"],
	                description=description,
	                amount=amount,
	                type=txn_type,
	                balance=balance,
	                reference=self._extract_reference(description),
	            ))

	        return transactions

	    def _parse_generic(self, text: str) -> List[PDFTransaction]:
	        """
	        Generic parsing for unknown bank formats.

	        Args:
	            text: Extracted statement text

	        Returns:
	            Transactions matched by the generic pattern; balance is never set
	        """
	        transactions = []

	        for match in self._GENERIC_PATTERN.finditer(text):
	            date, raw_description, raw_amount = match.groups()
	            try:
	                amount = self._to_amount(raw_amount)
	            except ValueError:
	                continue

	            description = raw_description.strip()
	            transactions.append(PDFTransaction(
	                date=date,
	                description=description,
	                amount=amount,
	                type=self._infer_type(description),
	                reference=self._extract_reference(description),
	            ))

	        return transactions

	    def _extract_reference(self, description: str) -> Optional[str]:
	        """Return the first Ref/UTR/IMPS/NEFT reference found in ``description``, else None."""
	        for pattern in self._REFERENCE_PATTERNS:
	            match = pattern.search(description)
	            if match:
	                return match.group(1)
	        return None

	    def _infer_type(self, description: str) -> str:
	        """
	        Infer transaction type from description.

	        Credit keywords are checked before debit keywords; a description
	        with neither is treated as a debit.
	        """
	        description_lower = description.lower()

	        if any(kw in description_lower for kw in self._CREDIT_KEYWORDS):
	            return "credit"
	        if any(kw in description_lower for kw in self._DEBIT_KEYWORDS):
	            return "debit"
	        return "debit"  # Default to debit

	    def to_dict_list(self, transactions: List[PDFTransaction]) -> List[Dict]:
	        """Convert transactions to list of dictionaries."""
	        return [
	            {
	                "date": t.date,
	                "description": t.description,
	                "amount": t.amount,
	                "type": t.type,
	                "balance": t.balance,
	                "reference": t.reference,
	            }
	            for t in transactions
	        ]


	class ImageOCRParser:
	    """
	    Parse transaction screenshots using OCR.

	    Uses EasyOCR or pytesseract for text extraction.
	    """

	    # Accepted values for the ``backend`` constructor argument.
	    _BACKENDS = ("auto", "easyocr", "tesseract")

	    def __init__(self, backend: str = "auto"):
	        """
	        Initialize OCR parser.

	        Args:
	            backend: "easyocr", "tesseract", or "auto"

	        Raises:
	            ValueError: If ``backend`` is not one of the accepted values
	            ImportError: If the requested backend (or, for "auto", every
	                backend) cannot be imported
	        """
	        # Reject typos up front instead of silently returning "" from
	        # every extraction call later on.
	        if backend not in self._BACKENDS:
	            raise ValueError(
	                f"Unknown OCR backend {backend!r}; expected one of {self._BACKENDS}"
	            )
	        self.backend = backend
	        self.reader = None
	        self._init_backend()

	    def _init_backend(self):
	        """Import the selected backend; "auto" tries easyocr first, then pytesseract."""
	        if self.backend == "auto":
	            try:
	                import easyocr
	                self.reader = easyocr.Reader(['en', 'hi'])
	                self.backend = "easyocr"
	            except ImportError:
	                try:
	                    import pytesseract  # noqa: F401 - availability check only
	                    self.backend = "tesseract"
	                except ImportError:
	                    raise ImportError("No OCR backend available. Install easyocr or pytesseract")

	        elif self.backend == "easyocr":
	            import easyocr
	            self.reader = easyocr.Reader(['en', 'hi'])

	        elif self.backend == "tesseract":
	            import pytesseract  # noqa: F401 - fail early if it is missing

	    @staticmethod
	    def _join_easyocr(results) -> str:
	        """Join the text element (index 1) of each easyocr result, one per line."""
	        return "\n".join(r[1] for r in results)

	    def extract_text(self, image_path: Path) -> str:
	        """
	        Extract text from image.

	        Args:
	            image_path: Path to image file

	        Returns:
	            Extracted text ("" if ``self.backend`` is not a usable backend)
	        """
	        if self.backend == "easyocr":
	            return self._join_easyocr(self.reader.readtext(str(image_path)))

	        if self.backend == "tesseract":
	            import pytesseract
	            from PIL import Image

	            # Context manager closes the underlying file handle.
	            with Image.open(image_path) as image:
	                return pytesseract.image_to_string(image)

	        return ""

	    def extract_text_from_bytes(self, image_bytes: bytes) -> str:
	        """
	        Extract text from image bytes.

	        Args:
	            image_bytes: Image content as bytes

	        Returns:
	            Extracted text ("" if ``self.backend`` is not a usable backend)
	        """
	        if self.backend == "easyocr":
	            import numpy as np
	            from PIL import Image

	            with Image.open(io.BytesIO(image_bytes)) as image:
	                # Normalise to RGB so palette/CMYK/alpha images become a
	                # plain H x W x 3 pixel array rather than palette indices
	                # or an unexpected channel layout.
	                image_array = np.array(image.convert("RGB"))
	            return self._join_easyocr(self.reader.readtext(image_array))

	        if self.backend == "tesseract":
	            import pytesseract
	            from PIL import Image

	            with Image.open(io.BytesIO(image_bytes)) as image:
	                return pytesseract.image_to_string(image)

	        return ""


	# ============================================================================
	# UTILITY FUNCTIONS
	# ============================================================================

	def parse_pdf(file_path: str) -> List[Dict]:
	    """
	    Parse a statement PDF in one call and return plain dictionaries.

	    Args:
	        file_path: Path to PDF file

	    Returns:
	        List of transaction dictionaries
	    """
	    statement_parser = BankStatementParser()
	    return statement_parser.to_dict_list(
	        statement_parser.parse_file(Path(file_path))
	    )


	def parse_image(file_path: str) -> str:
	    """
	    Run OCR on an image file in one call using the default backend choice.

	    Args:
	        file_path: Path to image file

	    Returns:
	        Extracted text
	    """
	    image_path = Path(file_path)
	    return ImageOCRParser().extract_text(image_path)


	# ============================================================================
	# MAIN
	# ============================================================================

	if __name__ == "__main__":
	    # Command-line entry point: a ".pdf" argument is parsed as a bank
	    # statement, anything else is treated as an image and run through OCR.
	    import sys

	    if len(sys.argv) < 2:
	        print("Usage: python pdf_parser.py <file.pdf>")
	        sys.exit(1)

	    file_path = sys.argv[1]

	    try:
	        # Compare case-insensitively so "STATEMENT.PDF" is not sent to OCR.
	        if file_path.lower().endswith('.pdf'):
	            transactions = parse_pdf(file_path)
	            print(f"Found {len(transactions)} transactions:")
	            # Show only the first ten as a preview.
	            for t in transactions[:10]:
	                print(f" {t['date']}: {t['type']} ₹{t['amount']:,.2f} - {t['description'][:40]}")
	        else:
	            text = parse_image(file_path)
	            print("Extracted text:")
	            print(text)
	    except ImportError as e:
	        # A missing optional dependency is a failure: report it and exit
	        # with a non-zero status so calling scripts can detect it.
	        print(f"Error: {e}")
	        sys.exit(1)