# Ranjit Behera
# FinEE v1.0 - Finance Entity Extractor
# dcc24f8
"""
Financial Entity Extractor Module.
This module provides enterprise-grade extraction of financial entities from
transaction emails across multiple Indian banks and payment platforms.
Supported Banks:
- HDFC Bank
- ICICI Bank
- State Bank of India (SBI)
- Axis Bank
- Kotak Mahindra Bank
Supported Payment Platforms:
- PhonePe
- Google Pay (GPay)
- Paytm
- BHIM UPI
Example:
>>> from src.data.extractor import EntityExtractor
>>> extractor = EntityExtractor()
>>> result = extractor.extract("Rs.2500 debited from A/c 3545 on 05-01-26")
>>> print(result.to_dict())
{'amount': '2500', 'type': 'debit', 'account': '3545', 'date': '05-01-26'}
Author: Ranjit Behera
License: MIT
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field, asdict
from typing import (
Dict,
List,
Optional,
Pattern,
Tuple,
Any,
ClassVar,
)
# Module-level logger, named after this module so applications can tune its
# level and handlers through the standard logging hierarchy.
logger = logging.getLogger(__name__)
@dataclass
class FinancialEntity:
    """
    Represents extracted financial entities from a transaction message.

    This dataclass holds all extracted fields from a financial transaction
    notification, including amount, type, account details, and optional
    merchant/category information.

    Attributes:
        amount: Transaction amount as string (preserves decimal precision).
        type: Transaction type - 'debit' or 'credit'.
        account: Account number (usually last 4 digits, masked).
        date: Transaction date in original format from message.
        reference: UPI/IMPS/NEFT reference number.
        merchant: Identified merchant name (e.g., 'swiggy', 'amazon').
        payment_method: Payment method - 'upi', 'neft', 'imps', 'card'.
        category: Transaction category - 'food', 'shopping', 'transport', etc.
        bank: Source bank name if identified.
        balance: Available balance after transaction.
        raw_text: Original text used for extraction (for debugging).
            Excluded from ``repr`` and from ``to_dict()``.

    Example:
        >>> entity = FinancialEntity(
        ...     amount="2500.00",
        ...     type="debit",
        ...     account="3545",
        ...     date="05-01-26",
        ...     merchant="swiggy",
        ...     category="food"
        ... )
        >>> entity.is_valid()
        True
        >>> entity.to_dict()
        {'amount': '2500.00', 'type': 'debit', ...}
    """

    amount: Optional[str] = None
    type: Optional[str] = None
    account: Optional[str] = None
    date: Optional[str] = None
    reference: Optional[str] = None
    merchant: Optional[str] = None
    payment_method: Optional[str] = None
    category: Optional[str] = None
    bank: Optional[str] = None
    balance: Optional[str] = None
    raw_text: str = field(default="", repr=False)

    # Reference sets of recognised values. They are ClassVars, so they are not
    # dataclass fields and never appear in asdict()/to_dict().
    VALID_TYPES: ClassVar[set] = {"debit", "credit"}
    VALID_PAYMENT_METHODS: ClassVar[set] = {"upi", "neft", "imps", "rtgs", "card", "wallet"}
    VALID_CATEGORIES: ClassVar[set] = {
        "food", "shopping", "transport", "bills", "grocery",
        "entertainment", "health", "education", "transfer", "other"
    }

    def __post_init__(self) -> None:
        """
        Normalize the enumerated fields to lowercase after initialization.

        Only ``type``, ``payment_method`` and ``category`` are touched. The
        values are not checked against the ``VALID_*`` sets, so unknown
        values are stored as given (lowercased).
        """
        if self.type:
            self.type = self.type.lower()
        if self.payment_method:
            self.payment_method = self.payment_method.lower()
        if self.category:
            self.category = self.category.lower()

    def is_valid(self) -> bool:
        """
        Check if the entity has minimum required fields.

        A valid entity must have at least a non-empty amount and
        transaction type.

        Returns:
            bool: True if entity has minimum required fields.

        Example:
            >>> entity = FinancialEntity(amount="100", type="debit")
            >>> entity.is_valid()
            True
            >>> entity = FinancialEntity(amount="100")
            >>> entity.is_valid()
            False
        """
        return bool(self.amount and self.type)

    def is_complete(self) -> bool:
        """
        Check if the entity has all core fields populated.

        A complete entity has non-empty amount, type, account, and date.

        Returns:
            bool: True if entity has all core fields.
        """
        return bool(
            self.amount and
            self.type and
            self.account and
            self.date
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert entity to dictionary, excluding None values.

        ``raw_text`` is always omitted.

        Returns:
            Dict[str, Any]: Dictionary with non-None fields only.

        Example:
            >>> entity = FinancialEntity(amount="500", type="debit")
            >>> entity.to_dict()
            {'amount': '500', 'type': 'debit'}
        """
        return {
            key: value
            for key, value in asdict(self).items()
            if value is not None and key != "raw_text"
        }

    def to_json_string(self) -> str:
        """
        Convert entity to JSON string for model training.

        Returns:
            str: JSON representation of ``to_dict()``, indented by 2 spaces.
        """
        import json
        return json.dumps(self.to_dict(), indent=2)

    def confidence_score(self) -> float:
        """
        Calculate confidence score based on populated fields.

        Required fields (amount, type) carry 60% of the score and the
        optional fields (account, date, reference, merchant, category) carry
        the remaining 40%. A field counts as populated only when it is
        truthy, matching ``is_valid()`` and ``is_complete()``; an empty
        string therefore earns no credit.

        Returns:
            float: Score between 0.0 and 1.0.
        """
        required_fields = ["amount", "type"]
        optional_fields = ["account", "date", "reference", "merchant", "category"]

        required_score = sum(
            1 for name in required_fields if getattr(self, name)
        ) / len(required_fields)
        optional_score = sum(
            1 for name in optional_fields if getattr(self, name)
        ) / len(optional_fields)

        return required_score * 0.6 + optional_score * 0.4
class EntityExtractor:
    """
    Production-grade financial entity extractor using regex patterns.

    This extractor uses a set of regex patterns and keyword tables to extract
    structured financial data from transaction notifications. It supports
    multiple Indian banks and payment platforms.

    Features:
        - Amount extraction (Rs., ₹, INR formats)
        - Transaction type detection (debit/credit)
        - Account number extraction (masked formats)
        - Date parsing (multiple formats)
        - Reference number extraction
        - Merchant identification
        - Payment method detection
        - Category classification

    Attributes:
        AMOUNT_PATTERNS: Compiled regex patterns for amount extraction.
        DEBIT_KEYWORDS: Keywords indicating debit transactions.
        CREDIT_KEYWORDS: Keywords indicating credit transactions.
        ACCOUNT_PATTERNS: Compiled regex patterns for account numbers.
        DATE_PATTERNS: Compiled regex patterns for transaction dates.
        REFERENCE_PATTERNS: Compiled regex patterns for reference numbers.
        BALANCE_PATTERNS: Compiled regex patterns for available balance.
        MERCHANTS: Merchant name to keyword mapping.
        CATEGORY_KEYWORDS: Category to keyword mapping.
        PAYMENT_PATTERNS: Payment method to keyword mapping.
        BANK_PATTERNS: Bank name to keyword mapping.

    Example:
        >>> extractor = EntityExtractor()
        >>> result = extractor.extract(
        ...     "HDFC Bank: Rs.2500.00 debited from A/c **3545 on 05-01-26"
        ... )
        >>> result.amount
        '2500.00'
        >>> result.merchant is None  # no merchant keyword in this message
        True

    Note:
        For best results, pass the complete transaction message including
        sender information and subject line when available.
    """

    # Amount extraction patterns (ordered by specificity).
    # The lookbehind stops "Rs"/"INR" from matching the tail of an ordinary
    # word under IGNORECASE (e.g. "orders 12345"); the capture must start with
    # a digit so a run of bare commas can never be returned as an amount.
    AMOUNT_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'(?<![A-Za-z])(?:Rs\.?|INR|₹)\s*(\d[\d,]*(?:\.\d{2})?)', re.IGNORECASE),
        re.compile(r'(?:amount|amt)[:\s]*(\d[\d,]*(?:\.\d{2})?)', re.IGNORECASE),
        re.compile(r'(?:debited|credited)[:\s]*(?:Rs\.?|INR|₹)?\s*(\d[\d,]*(?:\.\d{2})?)', re.IGNORECASE),
    ]

    # Transaction type keywords
    DEBIT_KEYWORDS: ClassVar[set] = {
        'debited', 'debit', 'paid', 'sent', 'withdrawn',
        'purchase', 'payment', 'transferred out', 'dr'
    }
    CREDIT_KEYWORDS: ClassVar[set] = {
        'credited', 'credit', 'received', 'deposited',
        'refund', 'cashback', 'transferred in', 'cr'
    }

    # Account extraction patterns
    ACCOUNT_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'(?:a/c|acct?|account)\s*(?:no\.?)?\s*[:\s]*\**[xX]*(\d{4,})', re.IGNORECASE),
        re.compile(r'[*xX]+(\d{4})\b'),
        re.compile(r'(?:XX|xx)(\d{4})\b'),
    ]

    # Date extraction patterns.
    # Word boundaries keep the DD-MM-YY pattern from matching inside an ISO
    # date ("2026-01-05" used to be returned as "26-01-05").
    DATE_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'\b(\d{2}[-/]\d{2}[-/]\d{2,4})\b'),
        re.compile(r'\b(\d{2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4})\b', re.IGNORECASE),
        re.compile(r'\b(\d{4}[-/]\d{2}[-/]\d{2})\b'),
    ]

    # Reference number patterns
    REFERENCE_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'(?:ref(?:erence)?|txn|upi|imps)\s*(?:no\.?|id)?[:\s]*(\d{10,})', re.IGNORECASE),
        re.compile(r'(?:transaction)\s*(?:id)?[:\s]*(\d{10,})', re.IGNORECASE),
    ]

    # Available-balance patterns (compiled once instead of on every call).
    # The leading boundary avoids matching "bal" inside words like "global".
    BALANCE_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'\b(?:avl\.?\s*)?bal(?:ance)?[:\s]*(?:Rs\.?|INR|₹)?\s*(\d[\d,]*(?:\.\d{2})?)', re.IGNORECASE),
    ]

    # Merchant identification
    MERCHANTS: ClassVar[Dict[str, List[str]]] = {
        # Food delivery
        'swiggy': ['swiggy', 'swiggy@'],
        'zomato': ['zomato', 'zomato@'],
        'dominos': ['dominos', 'domino'],
        'mcdonalds': ['mcdonald', 'mcd@'],
        'kfc': ['kfc@', 'kfc '],
        'starbucks': ['starbucks', 'sbux'],
        # E-commerce
        'amazon': ['amazon', 'amzn', 'amazon@'],
        'flipkart': ['flipkart', 'fkrt'],
        'myntra': ['myntra'],
        'ajio': ['ajio'],
        'nykaa': ['nykaa'],
        # Transport
        'uber': ['uber'],
        'ola': ['ola@', 'olacabs'],
        'rapido': ['rapido'],
        'metro': ['metro', 'dmrc', 'bmrc'],
        # Bills & Utilities
        'airtel': ['airtel'],
        'jio': ['jio@', 'reliancejio'],
        'vodafone': ['vodafone', 'vi@'],
        'electricity': ['bescom', 'electricity', 'power'],
        'gas': ['indane', 'bharat gas', 'hp gas'],
        # Grocery
        'bigbasket': ['bigbasket', 'bb@'],
        'zepto': ['zepto'],
        'blinkit': ['blinkit', 'grofers'],
        'dmart': ['dmart', 'd-mart'],
    }

    # Category mapping
    CATEGORY_KEYWORDS: ClassVar[Dict[str, List[str]]] = {
        'food': ['swiggy', 'zomato', 'dominos', 'mcdonald', 'kfc', 'restaurant', 'cafe', 'food'],
        'shopping': ['amazon', 'flipkart', 'myntra', 'ajio', 'nykaa', 'shopping'],
        'transport': ['uber', 'ola', 'rapido', 'metro', 'cab', 'taxi', 'fuel', 'petrol'],
        'bills': ['airtel', 'jio', 'vodafone', 'electricity', 'water', 'gas', 'broadband'],
        'grocery': ['bigbasket', 'zepto', 'blinkit', 'dmart', 'grocery', 'supermarket'],
        'entertainment': ['netflix', 'prime', 'hotstar', 'spotify', 'movie', 'theatre'],
        'health': ['pharmacy', 'medical', 'hospital', 'doctor', 'health'],
        'education': ['school', 'college', 'course', 'udemy', 'education'],
    }

    # Payment method patterns
    PAYMENT_PATTERNS: ClassVar[Dict[str, List[str]]] = {
        'upi': ['upi', 'vpa', '@ybl', '@oksbi', '@okicici', '@paytm', '@axisbank', '@icici'],
        'neft': ['neft'],
        'imps': ['imps'],
        'rtgs': ['rtgs'],
        'card': ['card', 'visa', 'mastercard', 'rupay', 'credit card', 'debit card'],
        'wallet': ['wallet', 'paytm wallet', 'phonepe wallet'],
    }

    # Bank identification
    BANK_PATTERNS: ClassVar[Dict[str, List[str]]] = {
        'hdfc': ['hdfc', 'hdfcbank'],
        'icici': ['icici'],
        'sbi': ['sbi', 'state bank'],
        'axis': ['axis'],
        'kotak': ['kotak'],
        'pnb': ['pnb', 'punjab national'],
        'bob': ['bob', 'bank of baroda'],
        'canara': ['canara'],
        'union': ['union bank'],
        'idbi': ['idbi'],
    }

    def __init__(self, debug: bool = False) -> None:
        """
        Initialize the EntityExtractor.

        Args:
            debug: If True, sets the module logger to DEBUG level. Note that
                this affects the shared module logger, not just this instance.

        Example:
            >>> extractor = EntityExtractor(debug=True)
        """
        self.debug = debug
        if debug:
            logger.setLevel(logging.DEBUG)
        logger.info("EntityExtractor initialized")

    def extract(self, text: str) -> FinancialEntity:
        """
        Extract financial entities from transaction text.

        This is the main entry point for entity extraction. It processes
        the input text and returns a FinancialEntity object with all
        detected fields populated.

        Args:
            text: The transaction message text to process.

        Returns:
            FinancialEntity: Extracted entity with populated fields. Fields
            that could not be detected are None.

        Example:
            >>> extractor = EntityExtractor()
            >>> result = extractor.extract(
            ...     "Rs.2500 debited from A/c 3545 to swiggy@ybl on 05-01-26"
            ... )
            >>> result.amount
            '2500'
            >>> result.merchant
            'swiggy'

        Note:
            The method never raises exceptions. On failure, it returns
            an entity with None fields and logs the error.
        """
        if not text or not isinstance(text, str):
            logger.warning("Empty or invalid text provided")
            return FinancialEntity(raw_text=str(text) if text else "")

        text_lower = text.lower()
        try:
            entity = FinancialEntity(
                amount=self._extract_amount(text),
                type=self._extract_type(text_lower),
                account=self._extract_account(text),
                date=self._extract_date(text),
                reference=self._extract_reference(text),
                merchant=self._extract_merchant(text_lower),
                payment_method=self._extract_payment_method(text_lower),
                category=self._extract_category(text_lower),
                bank=self._extract_bank(text_lower),
                balance=self._extract_balance(text),
                raw_text=text,
            )
            # to_dict() deep-copies the entity; skip it unless it will be logged.
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("Extracted entity: %s", entity.to_dict())
            return entity
        except Exception as e:
            # Deliberate catch-all: the documented contract is "never raises".
            logger.error("Extraction failed: %s", e, exc_info=True)
            return FinancialEntity(raw_text=text)

    def extract_to_dict(self, text: str) -> Dict[str, Any]:
        """
        Extract entities and return as dictionary.

        Convenience method that extracts and converts to dict in one call.

        Args:
            text: The transaction message text.

        Returns:
            Dict[str, Any]: Dictionary of extracted entities (non-None only).
        """
        return self.extract(text).to_dict()

    @staticmethod
    def _search_first(patterns: List[Pattern], text: str) -> Optional[str]:
        """Return group 1 of the first pattern that matches *text*, else None."""
        for pattern in patterns:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    @staticmethod
    def _first_label(text_lower: str, table: Dict[str, List[str]]) -> Optional[str]:
        """Return the first key of *table* having a keyword contained in *text_lower*."""
        for label, keywords in table.items():
            if any(keyword in text_lower for keyword in keywords):
                return label
        return None

    @staticmethod
    def _contains_keyword(text_lower: str, keywords: set) -> bool:
        """
        Check whether any type keyword occurs in *text_lower* as a word.

        A keyword must start at a word boundary, so 'sent' does not fire on
        "present" nor 'paid' on "prepaid", while inflections such as
        "refunded" or "payments" still match. Abbreviations of one or two
        letters ('dr', 'cr') must additionally end at a word boundary, so
        they do not fire on "draft" or "credit".
        """
        for keyword in keywords:
            tail = r'\b' if len(keyword) <= 2 else ''
            if re.search(r'\b' + re.escape(keyword) + tail, text_lower):
                return True
        return False

    def _extract_amount(self, text: str) -> Optional[str]:
        """Extract transaction amount from text, with thousands separators removed."""
        raw_amount = self._search_first(self.AMOUNT_PATTERNS, text)
        if raw_amount is None:
            return None
        amount = raw_amount.replace(',', '')
        logger.debug("Found amount: %s", amount)
        return amount

    def _extract_type(self, text_lower: str) -> Optional[str]:
        """
        Determine if transaction is debit or credit.

        Debit keywords take precedence when both kinds are present.
        """
        if self._contains_keyword(text_lower, self.DEBIT_KEYWORDS):
            return 'debit'
        if self._contains_keyword(text_lower, self.CREDIT_KEYWORDS):
            return 'credit'
        return None

    def _extract_account(self, text: str) -> Optional[str]:
        """Extract account number from text, reduced to its last 4 digits."""
        account = self._search_first(self.ACCOUNT_PATTERNS, text)
        if account is None:
            return None
        # Every pattern captures at least 4 digits, so this keeps the last 4.
        return account[-4:]

    def _extract_date(self, text: str) -> Optional[str]:
        """Extract transaction date from text, in its original format."""
        return self._search_first(self.DATE_PATTERNS, text)

    def _extract_reference(self, text: str) -> Optional[str]:
        """Extract reference/transaction number."""
        return self._search_first(self.REFERENCE_PATTERNS, text)

    def _extract_merchant(self, text_lower: str) -> Optional[str]:
        """Identify merchant from text."""
        return self._first_label(text_lower, self.MERCHANTS)

    def _extract_payment_method(self, text_lower: str) -> Optional[str]:
        """Detect payment method."""
        return self._first_label(text_lower, self.PAYMENT_PATTERNS)

    def _extract_category(self, text_lower: str) -> Optional[str]:
        """Classify transaction category."""
        return self._first_label(text_lower, self.CATEGORY_KEYWORDS)

    def _extract_bank(self, text_lower: str) -> Optional[str]:
        """Identify source bank."""
        return self._first_label(text_lower, self.BANK_PATTERNS)

    def _extract_balance(self, text: str) -> Optional[str]:
        """Extract available balance if mentioned, with separators removed."""
        raw_balance = self._search_first(self.BALANCE_PATTERNS, text)
        if raw_balance is None:
            return None
        return raw_balance.replace(',', '')
# Module-level convenience function
def extract_entities(text: str) -> Dict[str, Any]:
    """
    Extract entities from *text* without managing an extractor instance.

    A fresh ``EntityExtractor`` is created for the call and its
    ``extract_to_dict`` result is returned unchanged.

    Args:
        text: Transaction message text.

    Returns:
        Dict[str, Any]: Extracted entities as dictionary.

    Example:
        >>> from src.data.extractor import extract_entities
        >>> result = extract_entities("Rs.500 debited from account 1234")
        >>> result['amount']
        '500'
    """
    one_shot_extractor = EntityExtractor()
    return one_shot_extractor.extract_to_dict(text)
if __name__ == "__main__":
    # Smoke test: run a few representative messages through the extractor
    # and print what was found for each.
    logging.basicConfig(level=logging.DEBUG)
    demo_extractor = EntityExtractor(debug=True)

    sample_messages = (
        "HDFC Bank: Rs.2500.00 debited from A/c **3545 on 05-01-26 to VPA swiggy@ybl. Ref: 123456789012",
        "Dear Customer, INR 45000 credited to A/c 7890 on 04-01-2026. Salary from ACME Corp.",
        "You paid Rs.599 to Amazon from HDFC Bank a/c XX4567. UPI Ref: 987654321012",
    )

    number = 0
    for message in sample_messages:
        number += 1
        preview = message[:50]
        print(f"\n{'='*60}")
        print(f"Test {number}: {preview}...")
        entity = demo_extractor.extract(message)
        fields = entity.to_dict()
        print(f"Result: {fields}")
        print(f"Valid: {entity.is_valid()}, Confidence: {entity.confidence_score():.2%}")