finance-entity-extractor / src /finee /regex_engine.py

Ranjit Behera

Clean up repo structure and add benchmark

6a76e07 22 days ago

10.9 kB

	"""
	FinEE Regex Engine - Tier 1 pattern-based extraction.

	High-performance regex patterns for extracting financial entities from
	Indian banking messages. Covers HDFC, ICICI, SBI, Axis, Kotak and
	payment apps (PhonePe, GPay, Paytm).
	"""

	import re
	from dataclasses import dataclass
	from typing import Any, Callable, Dict, List, Optional, Tuple

	from .schema import ExtractionResult, TransactionType, ExtractionSource, FieldMeta


	@dataclass
	class RegexPattern:
	    """A compiled regex pattern with metadata.

	    Attributes:
	        name: Identifier for this pattern (e.g. ``'amount_rs'``).
	        pattern: The compiled regular expression. When no ``extractor``
	            is set, capture group 1 is used as the extracted value.
	        field: Name of the result field this pattern feeds.
	        priority: Ranking among patterns for the same field; higher
	            values are tried first.
	        extractor: Optional callable that receives the match object and
	            returns the final value in place of capture group 1.
	    """
	    name: str
	    pattern: re.Pattern
	    field: str
	    priority: int = 0  # Higher = preferred
	    # Optional post-processing. Annotated as a real callable type: the
	    # builtin ``callable`` is a function, not a type, and the default is
	    # None, so the annotation must be Optional.
	    extractor: Optional[Callable[[re.Match], Any]] = None


	class RegexEngine:
	    """
	    Tier 1 extraction engine using regex patterns.

	    Extracts: amount, type, date, reference, account, vpa, bank,
	    payment_method
	    Does NOT extract: merchant, category (handled by Tier 2/3)
	    """

	    def __init__(self):
	        """Initialize regex patterns."""
	        self._patterns = self._compile_patterns()

	    @staticmethod
	    def _lakhs_to_rupees(match: re.Match) -> str:
	        """Convert the lakhs figure in group 1 of ``match`` to rupees.

	        Returns:
	            The rupee amount as a string (1 lakh = 100000).

	        Raises:
	            ValueError: If group 1 is not a valid number. The lakhs
	                pattern accepts any run of digits and dots, so strings
	                such as "." or "1.2.3" can reach this function.
	        """
	        # Round to two decimals so binary float noise
	        # (1.1 * 100000 == 110000.00000000001) does not leak into the amount.
	        return str(round(float(match.group(1)) * 100000, 2))

	    def _compile_patterns(self) -> Dict[str, List[RegexPattern]]:
	        """Compile all regex patterns organized by field.

	        Returns:
	            Mapping of field name to the candidate patterns for that
	            field. Capture group 1 of each pattern holds the value unless
	            the pattern carries an extractor.
	        """
	        # NOTE: alternatives are separated with a bare `|`. An escaped `\|`
	        # matches a literal pipe character and would stop every pattern
	        # below from matching ordinary message text.
	        patterns = {
	            'amount': [
	                # Lakhs notation: 1.5 Lakh, 2 lacs, etc.
	                RegexPattern(
	                    'amount_lakhs',
	                    re.compile(r'([\d.]+)\s*(?:lakh|lac|L)s?\b', re.IGNORECASE),
	                    'amount',
	                    priority=15,
	                    extractor=self._lakhs_to_rupees,
	                ),
	                # Rs.2500.00 or Rs 2500 or INR 2,500.00 or ₹2,500
	                RegexPattern(
	                    'amount_rs',
	                    re.compile(r'(?:Rs\.?|INR|₹)\s*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
	                    'amount',
	                    priority=10,
	                ),
	                # 2500.00 debited/credited (amount before action, even without space)
	                RegexPattern(
	                    'amount_action_before',
	                    re.compile(
	                        r'([\d,]+(?:\.\d{1,2})?)\s*(?:has been\s+)?(?:debited|credited|transferred)',
	                        re.IGNORECASE,
	                    ),
	                    'amount',
	                    priority=5,
	                ),
	                # debited/credited 2500.00 (action before amount)
	                RegexPattern(
	                    'amount_action_after',
	                    re.compile(
	                        r'(?:debited|credited|transferred|spent)\s+(?:Rs\.?|INR|₹)?\s*([\d,]+(?:\.\d{1,2})?)',
	                        re.IGNORECASE,
	                    ),
	                    'amount',
	                    priority=5,
	                ),
	                # Amt: 2500 or Amount: 2500
	                RegexPattern(
	                    'amount_label',
	                    re.compile(r'(?:Amt|Amount)[:\s]*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
	                    'amount',
	                    priority=8,
	                ),
	            ],

	            'type': [
	                # Explicit debit/credit
	                RegexPattern(
	                    'type_explicit',
	                    re.compile(r'\b(debited|debit|withdrawn|sent|paid|spent)\b', re.IGNORECASE),
	                    'type',
	                    priority=10,
	                    extractor=lambda m: TransactionType.DEBIT,
	                ),
	                RegexPattern(
	                    'type_credit',
	                    re.compile(r'\b(credited|credit|received|refund|cashback|reversed)\b', re.IGNORECASE),
	                    'type',
	                    priority=10,
	                    extractor=lambda m: TransactionType.CREDIT,
	                ),
	            ],

	            'date': [
	                # DD-MM-YY or DD-MM-YYYY
	                RegexPattern(
	                    'date_dmy',
	                    re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b'),
	                    'date',
	                    priority=10,
	                ),
	                # DD Mon YYYY (28 Dec 2025)
	                RegexPattern(
	                    'date_text',
	                    re.compile(
	                        r'\b(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4})\b',
	                        re.IGNORECASE,
	                    ),
	                    'date',
	                    priority=8,
	                ),
	                # on DD/MM/YYYY at HH:MM
	                RegexPattern(
	                    'date_on',
	                    re.compile(r'on\s+(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', re.IGNORECASE),
	                    'date',
	                    priority=12,
	                ),
	            ],

	            'reference': [
	                # UPI reference (12-16 digits). Any run of ':', '#' or
	                # whitespace may separate the label from the number, so
	                # "Ref: 1234..." and "Ref#1234..." both match.
	                RegexPattern(
	                    'ref_upi',
	                    re.compile(r'(?:Ref(?:erence)?|UTR|UPI\s*Ref)[:\s#]*(\d{12,16})', re.IGNORECASE),
	                    'reference',
	                    priority=10,
	                ),
	                # Transaction ID. The lookahead demands at least one digit
	                # so that, under IGNORECASE, an ordinary word following the
	                # label ("Transaction successful") is not taken as an ID.
	                RegexPattern(
	                    'ref_txn',
	                    re.compile(
	                        r'(?:Txn|Transaction)\s*(?:ID|No|#)?[:\s]*((?=[A-Z]{0,19}\d)[A-Z0-9]{10,20})',
	                        re.IGNORECASE,
	                    ),
	                    'reference',
	                    priority=8,
	                ),
	                # Standalone 12-digit number (likely UPI ref)
	                RegexPattern(
	                    'ref_standalone',
	                    re.compile(r'\b(\d{12})\b'),
	                    'reference',
	                    priority=3,  # Low priority, might be phone number
	                ),
	            ],

	            'account': [
	                # A/c XX1234 or Account 1234 or XXXXX1234; also tolerates
	                # "A/c no. XX1234" and "Account number: 1234".
	                RegexPattern(
	                    'account_ac',
	                    re.compile(
	                        r'(?:A/c|Acct?|Account)(?:\s*(?:no\.?|number))?[:\s]*(?:[*X]{2,})?(\d{4,})',
	                        re.IGNORECASE,
	                    ),
	                    'account',
	                    priority=10,
	                ),
	                # from XXXX1234
	                RegexPattern(
	                    'account_from',
	                    re.compile(r'from\s+(?:[*X]{2,})?(\d{4,})', re.IGNORECASE),
	                    'account',
	                    priority=8,
	                ),
	                # ending with 1234
	                RegexPattern(
	                    'account_ending',
	                    re.compile(r'ending\s+(?:with\s+)?(\d{4})', re.IGNORECASE),
	                    'account',
	                    priority=6,
	                ),
	            ],

	            'vpa': [
	                # UPI VPA (user@bank)
	                RegexPattern(
	                    'vpa_upi',
	                    re.compile(r'(?:VPA|to|from)\s+([a-zA-Z0-9._-]+@[a-zA-Z0-9]+)', re.IGNORECASE),
	                    'vpa',
	                    priority=10,
	                ),
	                # Standalone VPA pattern
	                RegexPattern(
	                    'vpa_standalone',
	                    re.compile(
	                        r'\b([a-zA-Z0-9._-]+@(?:ybl|paytm|okaxis|oksbi|okhdfcbank|axl|ibl|upi|apl|fbl|icici|hdfcbank|sbi))\b',
	                        re.IGNORECASE,
	                    ),
	                    'vpa',
	                    priority=8,
	                ),
	            ],

	            'bank': [
	                # Bank names
	                RegexPattern(
	                    'bank_name',
	                    re.compile(
	                        r'\b(HDFC|ICICI|SBI|Axis|Kotak|PNB|BOB|IDFC|Yes Bank|IndusInd|RBL|Federal)\b',
	                        re.IGNORECASE,
	                    ),
	                    'bank',
	                    priority=10,
	                ),
	            ],

	            'payment_method': [
	                # Payment methods
	                RegexPattern(
	                    'method_upi',
	                    re.compile(r'\b(UPI|IMPS|NEFT|RTGS|NACH)\b', re.IGNORECASE),
	                    'payment_method',
	                    priority=10,
	                ),
	                # Card
	                RegexPattern(
	                    'method_card',
	                    re.compile(r'\b(Debit Card|Credit Card|Card)\b', re.IGNORECASE),
	                    'payment_method',
	                    priority=8,
	                ),
	            ],
	        }

	        return patterns

	    def extract(self, text: str) -> ExtractionResult:
	        """
	        Extract all possible fields from text using regex.

	        Each field with a match is set as an attribute on the result and
	        gets a ``FieldMeta`` entry in ``result.meta`` with source REGEX and
	        a fixed confidence of 0.95. Amounts are stored as floats; an amount
	        that cannot be parsed is dropped.

	        Args:
	            text: Input text (bank SMS, email, etc.)

	        Returns:
	            ExtractionResult with extracted fields
	        """
	        result = ExtractionResult(raw_input=text)

	        for field_name, patterns in self._patterns.items():
	            value = self._extract_field(text, patterns)
	            if value is None:
	                continue

	            if field_name == 'amount':
	                try:
	                    # Remove thousands separators and parse as float
	                    value = float(value.replace(',', ''))
	                except (ValueError, AttributeError):
	                    # e.g. a match consisting only of commas
	                    continue

	            setattr(result, field_name, value)
	            result.meta[field_name] = FieldMeta(
	                source=ExtractionSource.REGEX,
	                confidence=0.95,
	                raw_value=str(value)
	            )

	        return result

	    def _extract_field(self, text: str, patterns: List[RegexPattern]) -> Optional[Any]:
	        """
	        Extract a single field using multiple patterns.

	        Patterns are tried from highest to lowest priority (ties keep
	        their given order). The first match of the first matching pattern
	        wins. A match whose extractor raises ``ValueError`` is skipped, and
	        the search moves on to that pattern's later matches and then to
	        lower-priority patterns.

	        Args:
	            text: Text to search.
	            patterns: Candidate patterns for one field.

	        Returns:
	            The extractor's result, or capture group 1 when the pattern
	            has no extractor; None if nothing usable matched.
	        """
	        # Sort by priority (highest first); sorted() is stable.
	        ordered = sorted(patterns, key=lambda p: p.priority, reverse=True)

	        for pattern in ordered:
	            for match in pattern.pattern.finditer(text):
	                if not pattern.extractor:
	                    return match.group(1)
	                try:
	                    return pattern.extractor(match)
	                except ValueError:
	                    # Malformed capture (e.g. "." for a lakhs amount):
	                    # treat as "no match" rather than aborting extraction.
	                    continue

	        return None

	    def extract_all_matches(self, text: str, field: str) -> List[Tuple[Any, int]]:
	        """
	        Extract all matches for a specific field.

	        Matches whose extractor raises ``ValueError`` are omitted.

	        Args:
	            text: Text to search.
	            field: Field name; unknown names yield an empty list.

	        Returns:
	            List of (value, priority) tuples, highest priority first.
	        """
	        if field not in self._patterns:
	            return []

	        matches = []
	        for pattern in self._patterns[field]:
	            for match in pattern.pattern.finditer(text):
	                if pattern.extractor:
	                    try:
	                        value = pattern.extractor(match)
	                    except ValueError:
	                        continue
	                else:
	                    # Fall back to the whole match if there is no group.
	                    value = match.group(1) if match.lastindex else match.group(0)
	                matches.append((value, pattern.priority))

	        return sorted(matches, key=lambda x: x[1], reverse=True)


	# Module-level singleton: starts as None and is populated by
	# get_regex_engine() the first time it is called.
	_engine: Optional[RegexEngine] = None


	def get_regex_engine() -> RegexEngine:
	    """Return the shared RegexEngine, constructing it on first use."""
	    global _engine
	    if _engine is not None:
	        return _engine
	    _engine = RegexEngine()
	    return _engine


	def extract_with_regex(text: str) -> ExtractionResult:
	    """Run regex extraction on ``text`` using the shared engine."""
	    engine = get_regex_engine()
	    return engine.extract(text)