|
|
""" |
|
|
Financial Entity Extractor Module. |
|
|
|
|
|
This module provides enterprise-grade extraction of financial entities from |
|
|
transaction emails across multiple Indian banks and payment platforms. |
|
|
|
|
|
Supported Banks: |
|
|
- HDFC Bank |
|
|
- ICICI Bank |
|
|
- State Bank of India (SBI) |
|
|
- Axis Bank |
|
|
- Kotak Mahindra Bank |
|
|
|
|
|
Supported Payment Platforms: |
|
|
- PhonePe |
|
|
- Google Pay (GPay) |
|
|
- Paytm |
|
|
- BHIM UPI |
|
|
|
|
|
Example: |
|
|
>>> from src.data.extractor import EntityExtractor |
|
|
>>> extractor = EntityExtractor() |
|
|
>>> result = extractor.extract("Rs.2500 debited from A/c 3545 on 05-01-26") |
|
|
>>> print(result.to_dict()) |
|
|
{'amount': '2500', 'type': 'debit', 'account': '3545', 'date': '05-01-26'} |
|
|
|
|
|
Author: Ranjit Behera |
|
|
License: MIT |
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import logging |
|
|
import re |
|
|
from dataclasses import dataclass, field, asdict |
|
|
from typing import ( |
|
|
Dict, |
|
|
List, |
|
|
Optional, |
|
|
Pattern, |
|
|
Tuple, |
|
|
Any, |
|
|
ClassVar, |
|
|
) |
|
|
|
|
|
|
|
|
# Module-level logger named after this module. EntityExtractor(debug=True)
# raises this logger's level to DEBUG.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
@dataclass
class FinancialEntity:
    """
    Structured result of parsing one financial transaction message.

    Every extracted field is optional and defaults to ``None``; a field that
    could not be found in the message simply stays ``None``.

    Attributes:
        amount: Transaction amount kept as a string (preserves the decimal
            digits exactly as they appeared in the message).
        type: Transaction type - 'debit' or 'credit'.
        account: Account number (usually the last 4 digits, masked).
        date: Transaction date in the original format from the message.
        reference: UPI/IMPS/NEFT reference number.
        merchant: Identified merchant name (e.g., 'swiggy', 'amazon').
        payment_method: Payment method - 'upi', 'neft', 'imps', 'card', ...
        category: Transaction category - 'food', 'shopping', 'transport', etc.
        bank: Source bank name if identified.
        balance: Available balance after the transaction.
        raw_text: Original text used for extraction (for debugging). It is
            excluded from ``repr()`` and from :meth:`to_dict`.

    Example:
        >>> entity = FinancialEntity(
        ...     amount="2500.00",
        ...     type="debit",
        ...     account="3545",
        ...     date="05-01-26",
        ...     merchant="swiggy",
        ...     category="food"
        ... )
        >>> entity.is_valid()
        True
        >>> entity.to_dict()['amount']
        '2500.00'
    """

    amount: Optional[str] = None
    type: Optional[str] = None
    account: Optional[str] = None
    date: Optional[str] = None
    reference: Optional[str] = None
    merchant: Optional[str] = None
    payment_method: Optional[str] = None
    category: Optional[str] = None
    bank: Optional[str] = None
    balance: Optional[str] = None
    raw_text: str = field(default="", repr=False)

    # Reference vocabularies for the enumerated fields. They are ClassVars, so
    # they are not dataclass fields and never show up in asdict()/to_dict().
    # Nothing in this class enforces them; they are informational.
    VALID_TYPES: ClassVar[set] = {"debit", "credit"}
    VALID_PAYMENT_METHODS: ClassVar[set] = {"upi", "neft", "imps", "rtgs", "card", "wallet"}
    VALID_CATEGORIES: ClassVar[set] = {
        "food", "shopping", "transport", "bills", "grocery",
        "entertainment", "health", "education", "transfer", "other"
    }

    def __post_init__(self) -> None:
        """Lower-case the enumerated fields (type, payment_method, category).

        Values are only normalised, not checked against the ``VALID_*`` sets.
        """
        for name in ("type", "payment_method", "category"):
            value = getattr(self, name)
            if value:
                setattr(self, name, value.lower())

    def is_valid(self) -> bool:
        """
        Check if the entity has the minimum required fields.

        A valid entity must have a non-empty amount and transaction type.

        Returns:
            bool: True if both amount and type are populated.

        Example:
            >>> FinancialEntity(amount="100", type="debit").is_valid()
            True
            >>> FinancialEntity(amount="100").is_valid()
            False
        """
        return bool(self.amount and self.type)

    def is_complete(self) -> bool:
        """
        Check if the entity has all core fields populated.

        A complete entity has a non-empty amount, type, account, and date.

        Returns:
            bool: True if all four core fields are populated.
        """
        return bool(
            self.amount and
            self.type and
            self.account and
            self.date
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the entity to a dictionary, excluding None values.

        ``raw_text`` is always omitted.

        Returns:
            Dict[str, Any]: Dictionary with the non-None fields only.

        Example:
            >>> FinancialEntity(amount="500", type="debit").to_dict()
            {'amount': '500', 'type': 'debit'}
        """
        return {
            key: value
            for key, value in asdict(self).items()
            if value is not None and key != "raw_text"
        }

    def to_json_string(self) -> str:
        """
        Convert the entity to a JSON string for model training.

        Returns:
            str: :meth:`to_dict` serialised as JSON with a 2-space indent.
        """
        import json
        return json.dumps(self.to_dict(), indent=2)

    def confidence_score(self) -> float:
        """
        Calculate a confidence score based on populated fields.

        The required fields (amount, type) carry 60% of the score and the
        optional fields (account, date, reference, merchant, category) carry
        the remaining 40%; each group contributes its populated fraction.

        A field counts as populated only when it is truthy, which is the same
        rule :meth:`is_valid` and :meth:`is_complete` use. An empty string is
        therefore treated as missing rather than earning full credit.

        Returns:
            float: Score between 0.0 and 1.0.
        """
        required_fields = ("amount", "type")
        optional_fields = ("account", "date", "reference", "merchant", "category")

        def populated_fraction(names) -> float:
            return sum(1 for name in names if getattr(self, name)) / len(names)

        return (
            populated_fraction(required_fields) * 0.6
            + populated_fraction(optional_fields) * 0.4
        )
|
|
|
|
|
|
|
|
class EntityExtractor:
    """
    Regex- and keyword-based extractor for financial transaction messages.

    Each field of the resulting ``FinancialEntity`` is produced by its own
    private ``_extract_*`` method. Amount, account, date, reference and
    balance use compiled regular expressions; type, merchant, payment method,
    category and bank use keyword tables matched against the lower-cased text.

    Features:
        - Amount extraction (Rs., ₹, INR formats)
        - Transaction type detection (debit/credit)
        - Account number extraction (masked formats)
        - Date parsing (multiple formats)
        - Reference number extraction
        - Merchant identification
        - Payment method detection
        - Category classification
        - Bank and available-balance detection

    Attributes:
        AMOUNT_PATTERNS: Compiled regex patterns for amount extraction.
        DEBIT_KEYWORDS: Keywords indicating debit transactions.
        CREDIT_KEYWORDS: Keywords indicating credit transactions.
        ACCOUNT_PATTERNS: Compiled regex patterns for account numbers.
        DATE_PATTERNS: Compiled regex patterns for dates.
        REFERENCE_PATTERNS: Compiled regex patterns for reference numbers.
        BALANCE_PATTERNS: Compiled regex patterns for the available balance.
        MERCHANTS: Merchant name to keyword mapping.
        CATEGORY_KEYWORDS: Category to keyword mapping.
        PAYMENT_PATTERNS: Payment method to keyword mapping.
        BANK_PATTERNS: Bank name to keyword mapping.

    Example:
        >>> extractor = EntityExtractor()
        >>> result = extractor.extract(
        ...     "HDFC Bank: Rs.2500.00 debited from A/c **3545 on 05-01-26"
        ... )
        >>> result.amount
        '2500.00'
        >>> result.merchant is None  # no merchant keyword in this message
        True

    Note:
        For best results, pass the complete transaction message including
        sender information and subject line when available. In every table
        and pattern list the first entry that matches wins.
    """

    # "Rs" is rejected when glued to a preceding letter, so words such as
    # "offers 50" or "hrs 24" are not read as amounts. The numeric part must
    # start with a digit (a lone "," is not an amount) and may carry one or
    # two decimal digits.
    AMOUNT_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'(?:(?<![A-Za-z])Rs\.?|INR|₹)\s*(\d[\d,]*(?:\.\d{1,2})?)', re.IGNORECASE),
        re.compile(r'(?:amount|amt)[:\s]*(\d[\d,]*(?:\.\d{1,2})?)', re.IGNORECASE),
        re.compile(r'(?:debited|credited)[:\s]*(?:Rs\.?|INR|₹)?\s*(\d[\d,]*(?:\.\d{1,2})?)', re.IGNORECASE),
    ]

    # Keywords longer than two characters are matched as substrings; the
    # two-letter abbreviations ('dr', 'cr') are matched only as standalone
    # tokens (see _mentions_any). 'withdrawal' and 'withdrew' are listed
    # explicitly because they are no longer picked up through the 'dr'
    # letters they contain.
    DEBIT_KEYWORDS: ClassVar[set] = {
        'debited', 'debit', 'paid', 'sent', 'withdrawn', 'withdrawal',
        'withdrew', 'purchase', 'payment', 'transferred out', 'dr'
    }

    CREDIT_KEYWORDS: ClassVar[set] = {
        'credited', 'credit', 'received', 'deposited',
        'refund', 'cashback', 'transferred in', 'cr'
    }

    ACCOUNT_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'(?:a/c|acct?|account)\s*(?:no\.?)?\s*[:\s]*\**[xX]*(\d{4,})', re.IGNORECASE),
        re.compile(r'[*xX]+(\d{4})\b'),
        re.compile(r'(?:XX|xx)(\d{4})\b'),
    ]

    DATE_PATTERNS: ClassVar[List[Pattern]] = [
        # DD-MM-YY(YY). The digit guard on the left keeps this pattern from
        # biting into an ISO date ("2026-01-05" used to yield "26-01-05").
        re.compile(r'(?<!\d)(\d{2}[-/]\d{2}[-/]\d{2,4})'),
        # Day + month name + year: "05 Jan 2026", "5 January 2026",
        # "05-Jan-26", "05Jan26".
        re.compile(
            r'(?<!\d)(\d{1,2}[\s-]*'
            r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?'
            r'|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)'
            r'[\s,-]*\d{2,4})(?!\d)',
            re.IGNORECASE,
        ),
        # ISO style YYYY-MM-DD.
        re.compile(r'(\d{4}[-/]\d{2}[-/]\d{2})'),
    ]

    REFERENCE_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(r'(?:ref(?:erence)?|txn|upi|imps)\s*(?:no\.?|id)?[:\s]*(\d{10,})', re.IGNORECASE),
        re.compile(r'(?:transaction)\s*(?:id)?[:\s]*(\d{10,})', re.IGNORECASE),
    ]

    # Compiled once at class creation instead of on every _extract_balance call.
    BALANCE_PATTERNS: ClassVar[List[Pattern]] = [
        re.compile(
            r'(?:bal(?:ance)?|avl\.?\s*bal)[:\s]*(?:Rs\.?|INR|₹)?\s*(\d[\d,]*(?:\.\d{1,2})?)',
            re.IGNORECASE,
        ),
    ]

    MERCHANTS: ClassVar[Dict[str, List[str]]] = {
        # Food delivery / restaurants
        'swiggy': ['swiggy', 'swiggy@'],
        'zomato': ['zomato', 'zomato@'],
        'dominos': ['dominos', 'domino'],
        'mcdonalds': ['mcdonald', 'mcd@'],
        'kfc': ['kfc@', 'kfc '],
        'starbucks': ['starbucks', 'sbux'],

        # Shopping
        'amazon': ['amazon', 'amzn', 'amazon@'],
        'flipkart': ['flipkart', 'fkrt'],
        'myntra': ['myntra'],
        'ajio': ['ajio'],
        'nykaa': ['nykaa'],

        # Transport
        'uber': ['uber'],
        'ola': ['ola@', 'olacabs'],
        'rapido': ['rapido'],
        'metro': ['metro', 'dmrc', 'bmrc'],

        # Bills / utilities
        'airtel': ['airtel'],
        'jio': ['jio@', 'reliancejio'],
        'vodafone': ['vodafone', 'vi@'],
        'electricity': ['bescom', 'electricity', 'power'],
        'gas': ['indane', 'bharat gas', 'hp gas'],

        # Grocery
        'bigbasket': ['bigbasket', 'bb@'],
        'zepto': ['zepto'],
        'blinkit': ['blinkit', 'grofers'],
        'dmart': ['dmart', 'd-mart'],
    }

    CATEGORY_KEYWORDS: ClassVar[Dict[str, List[str]]] = {
        'food': ['swiggy', 'zomato', 'dominos', 'mcdonald', 'kfc', 'restaurant', 'cafe', 'food'],
        'shopping': ['amazon', 'flipkart', 'myntra', 'ajio', 'nykaa', 'shopping'],
        'transport': ['uber', 'ola', 'rapido', 'metro', 'cab', 'taxi', 'fuel', 'petrol'],
        'bills': ['airtel', 'jio', 'vodafone', 'electricity', 'water', 'gas', 'broadband'],
        'grocery': ['bigbasket', 'zepto', 'blinkit', 'dmart', 'grocery', 'supermarket'],
        'entertainment': ['netflix', 'prime', 'hotstar', 'spotify', 'movie', 'theatre'],
        'health': ['pharmacy', 'medical', 'hospital', 'doctor', 'health'],
        'education': ['school', 'college', 'course', 'udemy', 'education'],
    }

    PAYMENT_PATTERNS: ClassVar[Dict[str, List[str]]] = {
        'upi': ['upi', 'vpa', '@ybl', '@oksbi', '@okicici', '@paytm', '@axisbank', '@icici'],
        'neft': ['neft'],
        'imps': ['imps'],
        'rtgs': ['rtgs'],
        'card': ['card', 'visa', 'mastercard', 'rupay', 'credit card', 'debit card'],
        'wallet': ['wallet', 'paytm wallet', 'phonepe wallet'],
    }

    BANK_PATTERNS: ClassVar[Dict[str, List[str]]] = {
        'hdfc': ['hdfc', 'hdfcbank'],
        'icici': ['icici'],
        'sbi': ['sbi', 'state bank'],
        'axis': ['axis'],
        'kotak': ['kotak'],
        'pnb': ['pnb', 'punjab national'],
        'bob': ['bob', 'bank of baroda'],
        'canara': ['canara'],
        'union': ['union bank'],
        'idbi': ['idbi'],
    }

    def __init__(self, debug: bool = False) -> None:
        """
        Initialize the EntityExtractor.

        Args:
            debug: If True, enables debug logging.

        Example:
            >>> extractor = EntityExtractor(debug=True)
        """
        self.debug = debug
        if debug:
            # This changes the level of the shared module logger, so it
            # affects every extractor instance, not only this one.
            logger.setLevel(logging.DEBUG)

        logger.info("EntityExtractor initialized")

    def extract(self, text: str) -> FinancialEntity:
        """
        Extract financial entities from transaction text.

        This is the main entry point. Every ``_extract_*`` helper is run on
        the text (or its lower-cased form for keyword lookups) and the
        results are assembled into a single ``FinancialEntity``.

        Args:
            text: The transaction message text to process.

        Returns:
            FinancialEntity: Extracted entity with populated fields. Empty or
            non-string input yields an entity whose fields are all None.

        Example:
            >>> extractor = EntityExtractor()
            >>> result = extractor.extract(
            ...     "Rs.2500 debited from A/c 3545 to swiggy@ybl on 05-01-26"
            ... )
            >>> result.amount
            '2500'
            >>> result.merchant
            'swiggy'

        Note:
            The method never raises exceptions. On failure, it returns
            an entity with None fields and logs the error.
        """
        if not text or not isinstance(text, str):
            logger.warning("Empty or invalid text provided")
            return FinancialEntity(raw_text=str(text) if text else "")

        text_lower = text.lower()

        try:
            entity = FinancialEntity(
                amount=self._extract_amount(text),
                type=self._extract_type(text_lower),
                account=self._extract_account(text),
                date=self._extract_date(text),
                reference=self._extract_reference(text),
                merchant=self._extract_merchant(text_lower),
                payment_method=self._extract_payment_method(text_lower),
                category=self._extract_category(text_lower),
                bank=self._extract_bank(text_lower),
                balance=self._extract_balance(text),
                raw_text=text,
            )

            # Only build the dict when it will actually be logged.
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("Extracted entity: %s", entity.to_dict())
            return entity

        except Exception as e:
            # Deliberate catch-all: this method's contract is to never raise.
            logger.error("Extraction failed: %s", e, exc_info=True)
            return FinancialEntity(raw_text=text)

    def extract_to_dict(self, text: str) -> Dict[str, Any]:
        """
        Extract entities and return them as a dictionary.

        Convenience method that extracts and converts to dict in one call.

        Args:
            text: The transaction message text.

        Returns:
            Dict[str, Any]: Dictionary of extracted entities.
        """
        return self.extract(text).to_dict()

    @staticmethod
    def _first_group(patterns: List[Pattern], text: str) -> Optional[str]:
        """Return group 1 of the first pattern that matches ``text``, else None."""
        for pattern in patterns:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    @staticmethod
    def _first_label(table: Dict[str, List[str]], text_lower: str) -> Optional[str]:
        """Return the first key of ``table`` with a keyword contained in ``text_lower``."""
        for label, keywords in table.items():
            if any(keyword in text_lower for keyword in keywords):
                return label
        return None

    @staticmethod
    def _mentions_any(text_lower: str, keywords: set) -> bool:
        """Report whether ``text_lower`` mentions any of ``keywords``.

        Keywords of one or two characters must appear as standalone tokens
        (not flanked by letters); longer keywords match as substrings.
        Without this rule 'dr' fires inside words such as "mahindra" or
        "address" and 'cr' inside "subscription".
        """
        for keyword in keywords:
            if len(keyword) <= 2:
                token = rf'(?<![a-z]){re.escape(keyword)}(?![a-z])'
                if re.search(token, text_lower):
                    return True
            elif keyword in text_lower:
                return True
        return False

    def _extract_amount(self, text: str) -> Optional[str]:
        """Extract the transaction amount, with thousands separators removed."""
        matched = self._first_group(self.AMOUNT_PATTERNS, text)
        if matched is None:
            return None
        amount = matched.replace(',', '')
        logger.debug("Found amount: %s", amount)
        return amount

    def _extract_type(self, text_lower: str) -> Optional[str]:
        """Determine if the transaction is a debit or a credit.

        Debit keywords are checked first, so a message mentioning both kinds
        of keyword is classified as 'debit'.
        """
        if self._mentions_any(text_lower, self.DEBIT_KEYWORDS):
            return 'debit'
        if self._mentions_any(text_lower, self.CREDIT_KEYWORDS):
            return 'credit'
        return None

    def _extract_account(self, text: str) -> Optional[str]:
        """Extract the account number, reduced to its last four digits."""
        account = self._first_group(self.ACCOUNT_PATTERNS, text)
        if account is None:
            return None
        # Slicing a 4-character match returns it unchanged.
        return account[-4:]

    def _extract_date(self, text: str) -> Optional[str]:
        """Extract the transaction date, in the format used by the message."""
        return self._first_group(self.DATE_PATTERNS, text)

    def _extract_reference(self, text: str) -> Optional[str]:
        """Extract the reference/transaction number (10 or more digits)."""
        return self._first_group(self.REFERENCE_PATTERNS, text)

    def _extract_merchant(self, text_lower: str) -> Optional[str]:
        """Identify the merchant from the lower-cased text."""
        return self._first_label(self.MERCHANTS, text_lower)

    def _extract_payment_method(self, text_lower: str) -> Optional[str]:
        """Detect the payment method from the lower-cased text."""
        return self._first_label(self.PAYMENT_PATTERNS, text_lower)

    def _extract_category(self, text_lower: str) -> Optional[str]:
        """Classify the transaction category from the lower-cased text."""
        return self._first_label(self.CATEGORY_KEYWORDS, text_lower)

    def _extract_bank(self, text_lower: str) -> Optional[str]:
        """Identify the source bank from the lower-cased text."""
        return self._first_label(self.BANK_PATTERNS, text_lower)

    def _extract_balance(self, text: str) -> Optional[str]:
        """Extract the available balance if mentioned, without separators."""
        matched = self._first_group(self.BALANCE_PATTERNS, text)
        return None if matched is None else matched.replace(',', '')
|
|
|
|
|
|
|
|
|
|
|
def extract_entities(text: str) -> Dict[str, Any]:
    """
    Run a one-off extraction without managing an extractor instance.

    A fresh ``EntityExtractor`` (default settings) is created for the call
    and its ``extract_to_dict`` result is handed back unchanged.

    Args:
        text: Transaction message text.

    Returns:
        Dict[str, Any]: Extracted entities as a dictionary.

    Example:
        >>> from src.data.extractor import extract_entities
        >>> result = extract_entities("Rs.500 debited from account 1234")
        >>> result['amount']
        '500'
    """
    one_off_extractor = EntityExtractor()
    return one_off_extractor.extract_to_dict(text)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run the extractor over a few sample messages and
    # print what was found, with debug logging switched on.
    logging.basicConfig(level=logging.DEBUG)

    demo_extractor = EntityExtractor(debug=True)

    sample_messages = [
        "HDFC Bank: Rs.2500.00 debited from A/c **3545 on 05-01-26 to VPA swiggy@ybl. Ref: 123456789012",
        "Dear Customer, INR 45000 credited to A/c 7890 on 04-01-2026. Salary from ACME Corp.",
        "You paid Rs.599 to Amazon from HDFC Bank a/c XX4567. UPI Ref: 987654321012",
    ]

    i = 0
    for test in sample_messages:
        i += 1
        # Banner and truncated input first, then the extraction itself, so
        # log lines emitted during extraction appear under the right header.
        print(f"\n{'='*60}")
        print(f"Test {i}: {test[:50]}...")
        result = demo_extractor.extract(test)
        print(f"Result: {result.to_dict()}")
        print(f"Valid: {result.is_valid()}, Confidence: {result.confidence_score():.2%}")
|
|
|