"""
FinEE Regex Engine - Tier 1 pattern-based extraction.

High-performance regex patterns for extracting financial entities
from Indian banking messages. Covers HDFC, ICICI, SBI, Axis, Kotak
and payment apps (PhonePe, GPay, Paytm).
"""

import re
from typing import Optional, List, Tuple, Dict, Any, Callable
from dataclasses import dataclass

from .schema import ExtractionResult, TransactionType, ExtractionSource, FieldMeta


def _lakhs_to_rupees(match: re.Match) -> str:
    """Convert the number captured by the lakhs pattern into a rupee amount.

    The result is returned as a string so it flows through the same
    ``float(value.replace(',', ''))`` parsing step as every other amount.
    Rounding to two decimals removes binary floating-point noise
    (e.g. ``2.3 * 100000`` would otherwise be ``229999.99999999997``).
    """
    return str(round(float(match.group(1)) * 100000, 2))


@dataclass
class RegexPattern:
    """A compiled regex pattern with metadata."""

    name: str
    pattern: re.Pattern
    field: str
    priority: int = 0  # Higher = preferred
    # Optional post-processing: receives the match object and returns the
    # field value. When absent, capture group 1 is used as the value.
    extractor: Optional[Callable[[re.Match], Any]] = None


class RegexEngine:
    """
    Tier 1 extraction engine using regex patterns.

    Extracts: amount, date, reference, account, vpa, type, bank, payment_method
    Does NOT extract: merchant, category (handled by Tier 2/3)
    """

    def __init__(self):
        """Initialize regex patterns."""
        self._patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, List[RegexPattern]]:
        """Compile all regex patterns organized by field.

        Returns:
            Mapping of field name to the list of patterns that can extract it.
            Declaration order matters: patterns sharing a priority are tried
            in the order they are listed here.
        """
        patterns = {
            'amount': [
                # Lakhs notation: 1.5 Lakh, 2 lacs, etc.
                # The capture must be a well-formed number. A looser
                # ``[\d.]+`` would capture ".1.5" from "Rs.1.5 lakh" and make
                # float() raise inside the extractor.
                RegexPattern(
                    'amount_lakhs',
                    re.compile(r'(\d+(?:\.\d+)?)\s*(?:lakh|lac|L)s?\b', re.IGNORECASE),
                    'amount',
                    priority=15,
                    extractor=_lakhs_to_rupees
                ),
                # Rs.2500.00 or Rs 2500 or INR 2,500.00 or ₹2,500
                RegexPattern(
                    'amount_rs',
                    re.compile(r'(?:Rs\.?|INR|₹)\s*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
                    'amount',
                    priority=10
                ),
                # 2500.00 debited/credited (amount before action, even without space)
                RegexPattern(
                    'amount_action_before',
                    re.compile(r'([\d,]+(?:\.\d{1,2})?)\s*(?:has been\s+)?(?:debited|credited|transferred)', re.IGNORECASE),
                    'amount',
                    priority=5
                ),
                # debited/credited 2500.00 (action before amount)
                RegexPattern(
                    'amount_action_after',
                    re.compile(r'(?:debited|credited|transferred|spent)\s+(?:Rs\.?|INR|₹)?\s*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
                    'amount',
                    priority=5
                ),
                # Amt: 2500 or Amount: 2500
                RegexPattern(
                    'amount_label',
                    re.compile(r'(?:Amt|Amount)[:\s]*([\d,]+(?:\.\d{1,2})?)', re.IGNORECASE),
                    'amount',
                    priority=8
                ),
            ],
            'type': [
                # Explicit debit/credit.
                # Both type patterns share priority 10, so the debit pattern
                # (declared first) wins whenever both match. This keeps
                # messages such as "Credit Card ... spent" classified as DEBIT.
                RegexPattern(
                    'type_explicit',
                    re.compile(r'\b(debited|debit|withdrawn|sent|paid|spent)\b', re.IGNORECASE),
                    'type',
                    priority=10,
                    extractor=lambda m: TransactionType.DEBIT
                ),
                RegexPattern(
                    'type_credit',
                    re.compile(r'\b(credited|credit|received|refund|cashback|reversed)\b', re.IGNORECASE),
                    'type',
                    priority=10,
                    extractor=lambda m: TransactionType.CREDIT
                ),
            ],
            'date': [
                # DD-MM-YY or DD-MM-YYYY
                RegexPattern(
                    'date_dmy',
                    re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b'),
                    'date',
                    priority=10
                ),
                # DD Mon YYYY (28 Dec 2025)
                RegexPattern(
                    'date_text',
                    re.compile(r'\b(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4})\b', re.IGNORECASE),
                    'date',
                    priority=8
                ),
                # on DD/MM/YYYY at HH:MM
                RegexPattern(
                    'date_on',
                    re.compile(r'on\s+(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', re.IGNORECASE),
                    'date',
                    priority=12
                ),
            ],
            'reference': [
                # UPI reference (12-16 digits)
                RegexPattern(
                    'ref_upi',
                    re.compile(r'(?:Ref(?:erence)?|UTR|UPI\s*Ref)[:\s#]*(\d{12,16})', re.IGNORECASE),
                    'reference',
                    priority=10
                ),
                # Transaction ID
                # The lookahead demands at least one digit in the captured ID.
                # Without it, IGNORECASE lets any 10+ letter word qualify
                # ("Transaction successful" -> "successful").
                RegexPattern(
                    'ref_txn',
                    re.compile(r'(?:Txn|Transaction)\s*(?:ID|No|#)?[:\s]*((?=[A-Z0-9]{0,19}\d)[A-Z0-9]{10,20})', re.IGNORECASE),
                    'reference',
                    priority=8
                ),
                # Standalone 12-digit number (likely UPI ref)
                RegexPattern(
                    'ref_standalone',
                    re.compile(r'\b(\d{12})\b'),
                    'reference',
                    priority=3  # Low priority, might be phone number
                ),
            ],
            'account': [
                # A/c XX1234 or Account 1234 or XXXXX1234
                RegexPattern(
                    'account_ac',
                    re.compile(r'(?:A/c|Acct?|Account)(?:\s*(?:no\.?|number))?[:\s]*(?:[*X]{2,})?(\d{4,})', re.IGNORECASE),
                    'account',
                    priority=10
                ),
                # from XXXX1234
                RegexPattern(
                    'account_from',
                    re.compile(r'from\s+(?:[*X]{2,})?(\d{4,})', re.IGNORECASE),
                    'account',
                    priority=8
                ),
                # ending with 1234
                RegexPattern(
                    'account_ending',
                    re.compile(r'ending\s+(?:with\s+)?(\d{4})', re.IGNORECASE),
                    'account',
                    priority=6
                ),
            ],
            'vpa': [
                # UPI VPA (user@bank)
                RegexPattern(
                    'vpa_upi',
                    re.compile(r'(?:VPA|to|from)\s+([a-zA-Z0-9._-]+@[a-zA-Z0-9]+)', re.IGNORECASE),
                    'vpa',
                    priority=10
                ),
                # Standalone VPA pattern
                RegexPattern(
                    'vpa_standalone',
                    re.compile(r'\b([a-zA-Z0-9._-]+@(?:ybl|paytm|okaxis|oksbi|okhdfcbank|axl|ibl|upi|apl|fbl|icici|hdfcbank|sbi))\b', re.IGNORECASE),
                    'vpa',
                    priority=8
                ),
            ],
            'bank': [
                # Bank names
                RegexPattern(
                    'bank_name',
                    re.compile(r'\b(HDFC|ICICI|SBI|Axis|Kotak|PNB|BOB|IDFC|Yes Bank|IndusInd|RBL|Federal)\b', re.IGNORECASE),
                    'bank',
                    priority=10
                ),
            ],
            'payment_method': [
                # Payment methods
                RegexPattern(
                    'method_upi',
                    re.compile(r'\b(UPI|IMPS|NEFT|RTGS|NACH)\b', re.IGNORECASE),
                    'payment_method',
                    priority=10
                ),
                # Card
                RegexPattern(
                    'method_card',
                    re.compile(r'\b(Debit Card|Credit Card|Card)\b', re.IGNORECASE),
                    'payment_method',
                    priority=8
                ),
            ],
        }

        return patterns

    def extract(self, text: str) -> ExtractionResult:
        """
        Extract all possible fields from text using regex.

        Args:
            text: Input text (bank SMS, email, etc.)

        Returns:
            ExtractionResult with extracted fields. Each field that was found
            is set as an attribute on the result and gets a matching entry in
            ``result.meta``.
        """
        result = ExtractionResult(raw_input=text)

        for field_name, patterns in self._patterns.items():
            value = self._extract_field(text, patterns)
            if value is None:
                continue

            # Handle amount parsing
            if field_name == 'amount':
                try:
                    # Remove commas and parse as float
                    value = float(value.replace(',', ''))
                except (ValueError, AttributeError):
                    # Unparseable capture (e.g. only commas): leave the
                    # amount unset rather than storing a bad value.
                    continue

            setattr(result, field_name, value)
            result.meta[field_name] = FieldMeta(
                source=ExtractionSource.REGEX,
                confidence=0.95,
                raw_value=str(value)
            )

        return result

    def _extract_field(self, text: str, patterns: List[RegexPattern]) -> Optional[Any]:
        """
        Extract a single field using multiple patterns.

        Args:
            text: Input text to search.
            patterns: Candidate patterns for one field.

        Returns:
            The value from the highest-priority pattern that matches, or None
            when no pattern matches. The value is the extractor's result when
            the pattern has one, otherwise capture group 1.
        """
        # sorted() is stable, so patterns with equal priority keep their
        # declaration order and the first one declared wins.
        ranked = sorted(patterns, key=lambda p: p.priority, reverse=True)

        for candidate in ranked:
            match = candidate.pattern.search(text)
            if not match:
                continue
            if candidate.extractor:
                return candidate.extractor(match)
            return match.group(1)

        return None

    def extract_all_matches(self, text: str, field: str) -> List[Tuple[Any, int]]:
        """
        Extract all matches for a specific field.

        Args:
            text: Input text to search.
            field: Field name; unknown names yield an empty list.

        Returns:
            List of (value, priority) tuples sorted by priority, highest
            first. Values are extractor results where a pattern defines an
            extractor, otherwise the matched text.
        """
        if field not in self._patterns:
            return []

        matches = []
        for pattern in self._patterns[field]:
            for match in pattern.pattern.finditer(text):
                if pattern.extractor:
                    value = pattern.extractor(match)
                else:
                    # Fall back to the whole match when the pattern has no
                    # participating capture group.
                    value = match.group(1) if match.lastindex else match.group(0)
                matches.append((value, pattern.priority))

        return sorted(matches, key=lambda x: x[1], reverse=True)


# Module-level singleton, created lazily by get_regex_engine()
_engine: Optional[RegexEngine] = None


def get_regex_engine() -> RegexEngine:
    """Get or create the global regex engine instance."""
    global _engine
    if _engine is None:
        _engine = RegexEngine()
    return _engine


def extract_with_regex(text: str) -> ExtractionResult:
    """Convenience function for extraction."""
    return get_regex_engine().extract(text)