# Ranjit Behera
# Clean up repo structure and add benchmark
# 6a76e07
"""
FinEE Regex Engine - Tier 1 pattern-based extraction.
High-performance regex patterns for extracting financial entities from
Indian banking messages. Covers HDFC, ICICI, SBI, Axis, Kotak and
payment apps (PhonePe, GPay, Paytm).
"""
import re
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple

from .schema import ExtractionResult, TransactionType, ExtractionSource, FieldMeta
@dataclass
class RegexPattern:
    """A compiled regex pattern with metadata.

    Instances are grouped per output field and tried in descending
    ``priority`` order by the extraction engine.
    """

    # Identifier of the pattern, e.g. 'amount_rs'.
    name: str
    # Compiled expression; the value of interest is in capture group 1.
    pattern: re.Pattern
    # Name of the result field this pattern feeds, e.g. 'amount'.
    field: str
    # Higher = preferred.
    priority: int = 0
    # Optional post-processing: receives the match object and returns the
    # field value. When None, the captured text is used as-is.
    extractor: Optional[Callable[[re.Match], Any]] = None
class RegexEngine:
    """
    Tier 1 extraction engine using regex patterns.

    Extracts: amount, type, date, reference, account, vpa, bank,
    payment_method
    Does NOT extract: merchant, category (handled by Tier 2/3)
    """

    def __init__(self):
        """Initialize regex patterns."""
        self._patterns = self._compile_patterns()

    @staticmethod
    def _lakhs_to_rupees(match: re.Match) -> str:
        """Convert a lakhs capture (group 1) into a rupee amount string.

        The product is rounded to two decimals so that binary float
        artefacts (1.1 * 100000 == 110000.00000000001) do not leak into
        the amount.

        Raises:
            ValueError: if the capture is not a valid number; the lakhs
                pattern's character class also accepts text like "1.2.3".
        """
        return str(round(float(match.group(1)) * 100000, 2))

    def _compile_patterns(self) -> "Dict[str, List[RegexPattern]]":
        """Compile all regex patterns organized by field.

        Returns:
            Mapping of field name to the list of patterns that can fill it.
        """
        return {
            'amount': [
                # Lakhs notation: 1.5 Lakh, 2 lacs, etc.
                RegexPattern(
                    'amount_lakhs',
                    re.compile(r'([\d.]+)\s*(?:lakh|lac|L)s?\b', re.IGNORECASE),
                    'amount',
                    priority=15,
                    extractor=self._lakhs_to_rupees
                ),
                # Rs.2500.00 or Rs 2500 or INR 2,500.00 or ₹2,500
                RegexPattern(
                    'amount_rs',
                    re.compile(r'(?:Rs\.?|INR|₹)\s*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
                    'amount',
                    priority=10
                ),
                # 2500.00 debited/credited (amount before action, even without space)
                RegexPattern(
                    'amount_action_before',
                    re.compile(r'([\d,]+(?:\.\d{1,2})?)\s*(?:has been\s+)?(?:debited|credited|transferred)', re.IGNORECASE),
                    'amount',
                    priority=5
                ),
                # debited/credited 2500.00 (action before amount)
                RegexPattern(
                    'amount_action_after',
                    re.compile(r'(?:debited|credited|transferred|spent)\s+(?:Rs\.?|INR|₹)?\s*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
                    'amount',
                    priority=5
                ),
                # Amt: 2500 or Amount: 2500
                RegexPattern(
                    'amount_label',
                    re.compile(r'(?:Amt|Amount)[:\s]*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
                    'amount',
                    priority=8
                ),
            ],
            # Both type patterns share priority 10 and the priority sort is
            # stable, so the debit pattern is tried first: a message that
            # contains both debit and credit keywords resolves to DEBIT.
            'type': [
                # Explicit debit/credit
                RegexPattern(
                    'type_explicit',
                    re.compile(r'\b(debited|debit|withdrawn|sent|paid|spent)\b', re.IGNORECASE),
                    'type',
                    priority=10,
                    extractor=lambda m: TransactionType.DEBIT
                ),
                RegexPattern(
                    'type_credit',
                    re.compile(r'\b(credited|credit|received|refund|cashback|reversed)\b', re.IGNORECASE),
                    'type',
                    priority=10,
                    extractor=lambda m: TransactionType.CREDIT
                ),
            ],
            'date': [
                # DD-MM-YY or DD-MM-YYYY
                RegexPattern(
                    'date_dmy',
                    re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b'),
                    'date',
                    priority=10
                ),
                # DD Mon YYYY (28 Dec 2025)
                RegexPattern(
                    'date_text',
                    re.compile(r'\b(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4})\b', re.IGNORECASE),
                    'date',
                    priority=8
                ),
                # on DD/MM/YYYY at HH:MM
                RegexPattern(
                    'date_on',
                    re.compile(r'on\s+(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', re.IGNORECASE),
                    'date',
                    priority=12
                ),
            ],
            'reference': [
                # UPI reference (12-16 digits)
                RegexPattern(
                    'ref_upi',
                    re.compile(r'(?:Ref(?:erence)?|UTR|UPI\s*Ref)[:\s#]*(\d{12,16})', re.IGNORECASE),
                    'reference',
                    priority=10
                ),
                # Transaction ID
                RegexPattern(
                    'ref_txn',
                    re.compile(r'(?:Txn|Transaction)\s*(?:ID|No|#)?[:\s]*([A-Z0-9]{10,20})', re.IGNORECASE),
                    'reference',
                    priority=8
                ),
                # Standalone 12-digit number (likely UPI ref)
                RegexPattern(
                    'ref_standalone',
                    re.compile(r'\b(\d{12})\b'),
                    'reference',
                    priority=3  # Low priority, might be phone number
                ),
            ],
            'account': [
                # A/c XX1234 or Account 1234 or XXXXX1234
                RegexPattern(
                    'account_ac',
                    re.compile(r'(?:A/c|Acct?|Account)(?:\s*(?:no\.?|number))?[:\s]*(?:[*X]{2,})?(\d{4,})', re.IGNORECASE),
                    'account',
                    priority=10
                ),
                # from XXXX1234
                RegexPattern(
                    'account_from',
                    re.compile(r'from\s+(?:[*X]{2,})?(\d{4,})', re.IGNORECASE),
                    'account',
                    priority=8
                ),
                # ending with 1234
                RegexPattern(
                    'account_ending',
                    re.compile(r'ending\s+(?:with\s+)?(\d{4})', re.IGNORECASE),
                    'account',
                    priority=6
                ),
            ],
            'vpa': [
                # UPI VPA (user@bank)
                RegexPattern(
                    'vpa_upi',
                    re.compile(r'(?:VPA|to|from)\s+([a-zA-Z0-9._-]+@[a-zA-Z0-9]+)', re.IGNORECASE),
                    'vpa',
                    priority=10
                ),
                # Standalone VPA pattern
                RegexPattern(
                    'vpa_standalone',
                    re.compile(r'\b([a-zA-Z0-9._-]+@(?:ybl|paytm|okaxis|oksbi|okhdfcbank|axl|ibl|upi|apl|fbl|icici|hdfcbank|sbi))\b', re.IGNORECASE),
                    'vpa',
                    priority=8
                ),
            ],
            'bank': [
                # Bank names
                RegexPattern(
                    'bank_name',
                    re.compile(r'\b(HDFC|ICICI|SBI|Axis|Kotak|PNB|BOB|IDFC|Yes Bank|IndusInd|RBL|Federal)\b', re.IGNORECASE),
                    'bank',
                    priority=10
                ),
            ],
            'payment_method': [
                # Payment methods
                RegexPattern(
                    'method_upi',
                    re.compile(r'\b(UPI|IMPS|NEFT|RTGS|NACH)\b', re.IGNORECASE),
                    'payment_method',
                    priority=10
                ),
                # Card
                RegexPattern(
                    'method_card',
                    re.compile(r'\b(Debit Card|Credit Card|Card)\b', re.IGNORECASE),
                    'payment_method',
                    priority=8
                ),
            ],
        }

    def extract(self, text: str) -> "ExtractionResult":
        """
        Extract all possible fields from text using regex.

        Args:
            text: Input text (bank SMS, email, etc.)

        Returns:
            ExtractionResult with extracted fields. Every field that was
            found is set as an attribute and gets a FieldMeta entry in
            ``result.meta``; fields without a usable match are left alone.
        """
        result = ExtractionResult(raw_input=text)
        for field_name, patterns in self._patterns.items():
            value = self._extract_field(text, patterns)
            if value is None:
                continue
            if field_name == 'amount':
                try:
                    # Remove commas and parse as float
                    value = float(value.replace(',', ''))
                except (ValueError, AttributeError):
                    # Unparseable capture (e.g. only commas): leave the
                    # amount unset rather than store garbage.
                    continue
            setattr(result, field_name, value)
            result.meta[field_name] = FieldMeta(
                source=ExtractionSource.REGEX,
                confidence=0.95,
                raw_value=str(value)
            )
        return result

    def _extract_field(self, text: str, patterns: "List[RegexPattern]") -> Optional[Any]:
        """
        Extract a single field using multiple patterns.

        Patterns are tried from highest to lowest priority (ties keep their
        list order) and the first usable match wins. A match whose extractor
        raises ValueError is treated as unusable, so a malformed capture in
        a high-priority pattern no longer aborts the whole extraction.

        Args:
            text: Text to search.
            patterns: Candidate patterns for one field.

        Returns:
            The extractor's result, or the captured text when the pattern
            has no extractor; None when nothing matched.
        """
        # Sort by priority (highest first)
        for pattern in sorted(patterns, key=lambda p: p.priority, reverse=True):
            match = pattern.pattern.search(text)
            if not match:
                continue
            if not pattern.extractor:
                # Same fallback as extract_all_matches for group-less patterns.
                return match.group(1) if match.lastindex else match.group(0)
            try:
                return pattern.extractor(match)
            except ValueError:
                # Malformed capture: fall through to the next pattern.
                continue
        return None

    def extract_all_matches(self, text: str, field: str) -> List[Tuple[Any, int]]:
        """
        Extract all matches for a specific field.

        Args:
            text: Text to search.
            field: Field name, e.g. 'amount'; unknown names yield [].

        Returns:
            List of (value, priority) tuples, highest priority first.
            Matches whose extractor raises ValueError are skipped.
        """
        if field not in self._patterns:
            return []
        matches = []
        for pattern in self._patterns[field]:
            for match in pattern.pattern.finditer(text):
                if pattern.extractor:
                    try:
                        value = pattern.extractor(match)
                    except ValueError:
                        # Malformed capture: ignore this occurrence only.
                        continue
                else:
                    value = match.group(1) if match.lastindex else match.group(0)
                matches.append((value, pattern.priority))
        return sorted(matches, key=lambda x: x[1], reverse=True)
# Shared engine instance, created lazily on first request
_engine: Optional[RegexEngine] = None


def get_regex_engine() -> RegexEngine:
    """Return the module-wide RegexEngine, building it on first use."""
    global _engine
    if _engine is not None:
        return _engine
    _engine = RegexEngine()
    return _engine
def extract_with_regex(text: str) -> ExtractionResult:
    """Run the shared regex engine over ``text`` and return its result."""
    engine = get_regex_engine()
    return engine.extract(text)