"""
Number Extractor Service

Handles extraction and normalization of numerical values from insurance documents.

Supports:
- Indian number formats (lakhs, crores)
- Currency symbols (₹, Rs., INR, USD)
- Comma-separated numbers
- Word numbers (One Hundred Million)
- Percentage values
"""

import math
import re
from typing import Optional, List, Dict, Tuple
from decimal import Decimal, InvalidOperation


class NumberExtractor:
    """Extract and normalize numerical values from text."""

    # Number words for units, teens and tens
    WORD_TO_NUMBER = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
        'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'thirty': 30,
        'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70,
        'eighty': 80, 'ninety': 90
    }

    # Multipliers, covering both Indian (lakh, crore) and Western scales
    MAGNITUDE_WORDS = {
        'hundred': 100,
        'thousand': 1000,
        'lakh': 100000,
        'lac': 100000,
        'lakhs': 100000,
        'lacs': 100000,
        'million': 1000000,
        'crore': 10000000,
        'crores': 10000000,
        'billion': 1000000000
    }

    # Currency patterns (regex fragments) keyed by currency code
    CURRENCY_PATTERNS = {
        'INR': [r'₹', r'Rs\.?', r'INR', r'Rupees?'],
        'USD': [r'\$', r'USD', r'Dollars?'],
        'EUR': [r'€', r'EUR', r'Euros?']
    }

    # Context keywords for identifying number types
    NUMBER_CONTEXTS = {
        'sum_insured': ['sum insured', 'total sum insured', 'tsi', 'si',
                        'insured value', 'coverage amount', 'insured amount',
                        'sum assured'],
        'premium': ['premium', 'premium amount', 'total premium',
                    'net premium', 'gross premium', 'annual premium'],
        'tax': ['tax', 'gst', 'cgst', 'sgst', 'igst', 'service tax'],
        'deductible': ['deductible', 'excess', 'franchise']
    }

    def __init__(self):
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile regex patterns for number extraction."""
        # Currency amount: ₹1,00,000 or Rs. 1,00,000.00 or INR 100000.
        # Alphabetic codes must not be glued to a preceding letter, otherwise
        # the "rs" in "members 25" would be read as a rupee amount. Symbols
        # such as "$" stay unguarded so "US$100" still matches.
        alternatives = []
        for patterns in self.CURRENCY_PATTERNS.values():
            for fragment in patterns:
                if fragment[0].isalpha():
                    fragment = rf'(?<![A-Za-z]){fragment}'
                alternatives.append(fragment)
        currency_symbols = '|'.join(alternatives)
        self.pattern_currency = re.compile(
            rf'({currency_symbols})\s*([\d,]+(?:\.\d{{1,2}})?)',
            re.IGNORECASE
        )

        # Plain number with commas: 1,00,00,000 or 100,000,000
        self.pattern_number = re.compile(
            r'\b([\d,]+(?:\.\d+)?)\b'
        )

        # Number with magnitude words: 10 crore, 5.5 lakhs
        magnitude_words = '|'.join(
            re.escape(word)
            for word in sorted(self.MAGNITUDE_WORDS, key=len, reverse=True)
        )
        self.pattern_magnitude = re.compile(
            rf'\b([\d,.]+)\s*({magnitude_words})\b',
            re.IGNORECASE
        )

        # Percentage: 10%, 10.5 percent.
        # The word boundary applies only to the spelled-out forms: "%" is not
        # a word character, so a trailing \b after it would reject "10%" when
        # followed by a space or the end of the text.
        self.pattern_percent = re.compile(
            r'\b([\d.]+)\s*(?:%|percent(?:age)?\b)',
            re.IGNORECASE
        )

        # One whole-word pattern per context type. Whole-word matching keeps
        # short keywords such as "si" from firing inside "basic"; \s+ between
        # keyword parts tolerates line breaks inside a label.
        self._context_patterns = {}
        for num_type, keywords in self.NUMBER_CONTEXTS.items():
            keyword_alternatives = '|'.join(
                r'\s+'.join(re.escape(part) for part in keyword.split())
                for keyword in sorted(keywords, key=len, reverse=True)
            )
            self._context_patterns[num_type] = re.compile(
                rf'\b(?:{keyword_alternatives})\b',
                re.IGNORECASE
            )

    def parse_number(self, num_str: str) -> Optional[float]:
        """
        Parse a number string to float, handling Indian format.

        Args:
            num_str: Number string (e.g., "1,00,000" or "100,000.50")

        Returns:
            Float value, or None when the input is empty, not numeric,
            or not a finite number
        """
        if not num_str:
            return None

        # Remove commas and spaces
        cleaned = str(num_str).replace(',', '').replace(' ', '').strip()

        try:
            value = float(cleaned)
        except ValueError:
            return None

        # float() also accepts "nan" / "inf"; those are never valid amounts
        return value if math.isfinite(value) else None

    def parse_indian_number(self, text: str) -> Optional[float]:
        """
        Parse Indian number format (lakhs, crores).

        Args:
            text: Text like "10 crore" or "5.5 lakhs"

        Returns:
            Float value or None
        """
        if not text:
            return None

        match = self.pattern_magnitude.search(text.lower().strip())
        if match is None:
            return None

        num_part = self.parse_number(match.group(1))
        if num_part is None:
            return None

        magnitude = self.MAGNITUDE_WORDS.get(match.group(2).lower(), 1)
        return num_part * magnitude

    def word_to_number(self, text: str) -> Optional[int]:
        """
        Convert word numbers to integers.

        Hyphenated words ("twenty-one"), trailing punctuation and compound
        magnitudes ("two thousand crore") are supported. Words that are
        neither number words nor digits are ignored, so surrounding text
        such as "Rupees ... Only" does not prevent parsing.

        Args:
            text: Text like "One Hundred Million"

        Returns:
            Integer value, or None when no positive number was found
        """
        if not text:
            return None

        # Split "twenty-one" into two words; leave "-5" style digits alone
        normalized = re.sub(r'(?<=[a-z])-(?=[a-z])', ' ', text.lower())
        words = normalized.split()
        if not words:
            return None

        total = 0           # Sum of all completed groups
        current = 0         # Group being built, below the next big magnitude
        last_group = 0      # Most recently completed group
        last_magnitude = 0  # Non-zero only right after a magnitude >= 1000

        for raw_word in words:
            word = raw_word.strip('.,;:')
            if not word or word == 'and':
                continue

            if word in self.WORD_TO_NUMBER:
                current += self.WORD_TO_NUMBER[word]
                last_magnitude = 0
            elif word in self.MAGNITUDE_WORDS:
                magnitude = self.MAGNITUDE_WORDS[word]
                if magnitude < 1000:
                    # A bare "hundred" means one hundred
                    current = (current or 1) * magnitude
                    last_magnitude = 0
                elif current == 0 and 0 < last_magnitude < magnitude:
                    # Compound magnitude ("two thousand crore"): rescale the
                    # group that was just completed instead of adding a new
                    # one. Adding last_group * (magnitude - 1) turns the
                    # already-counted group into last_group * magnitude.
                    total += last_group * (magnitude - 1)
                    last_group *= magnitude
                    last_magnitude = magnitude
                else:
                    last_group = (current or 1) * magnitude
                    total += last_group
                    current = 0
                    last_magnitude = magnitude
            else:
                # Unknown word: accept plain digits, ignore anything else
                try:
                    current += int(word)
                except ValueError:
                    continue
                last_magnitude = 0

        total += current
        return total if total > 0 else None

    def _currency_for_symbol(self, symbol: str) -> str:
        """Map a matched currency symbol/code to its currency, defaulting to INR."""
        for code, patterns in self.CURRENCY_PATTERNS.items():
            if any(re.fullmatch(p, symbol, re.IGNORECASE) for p in patterns):
                return code
        return 'INR'

    def extract_numbers(self, text: str) -> List[Dict]:
        """
        Extract all numerical values from text with context.

        Currency amounts ("Rs. 1,00,000") and magnitude amounts ("10 crore")
        are collected. When two candidates cover overlapping text (for
        example "₹10" and "10 crore" inside "₹10 crore") only the larger
        value is kept. Amounts that merely sit close to each other are all
        kept.

        Args:
            text: Text to search for numbers

        Returns:
            List of dicts with number info, ordered by descending value:
            [{"value": 101000000, "context": "sum_insured",
              "currency": "INR", "original": "₹10,10,00,000",
              "position": 13}]
        """
        if not text:
            return []

        text_lower = text.lower()

        candidates: List[Tuple[float, int, int, str]] = []
        currency_spans: List[Tuple[int, int, str]] = []

        # Currency amounts
        for match in self.pattern_currency.finditer(text):
            currency = self._currency_for_symbol(match.group(1))
            currency_spans.append((match.start(), match.end(), currency))

            value = self.parse_number(match.group(2))
            if value is not None and value > 0:
                candidates.append(
                    (value, match.start(), match.end(), currency)
                )

        # Numbers with magnitude words (10 crore, 5 lakhs)
        for match in self.pattern_magnitude.finditer(text):
            value = self.parse_indian_number(match.group())
            if value is None or value <= 0:
                continue

            start, end = match.span()
            currency = 'INR'  # Lakhs/crores are typically INR
            for cur_start, cur_end, cur_code in currency_spans:
                if cur_start < end and start < cur_end:
                    # A currency symbol is attached to this number: use its
                    # currency and include the symbol in the reported span.
                    currency = cur_code
                    start = min(start, cur_start)
                    break
            candidates.append((value, start, end, currency))

        # Keep the largest value among candidates whose spans overlap
        kept_spans: List[Tuple[int, int]] = []
        unique_results = []
        for value, start, end, currency in sorted(
            candidates, key=lambda cand: -cand[0]
        ):
            if any(start < k_end and k_start < end
                   for k_start, k_end in kept_spans):
                continue
            kept_spans.append((start, end))
            unique_results.append({
                'value': value,
                'context': self._determine_number_context(text_lower, start),
                'currency': currency,
                'original': text[start:end],
                'position': start
            })

        return unique_results

    def _determine_number_context(self, text: str, position: int) -> str:
        """
        Determine what type of number this is based on surrounding text.

        Looks at the 100 characters before the number and returns the type
        of the keyword that ends closest to it, so each amount is labelled
        by its own nearest label rather than by an earlier one.

        Args:
            text: Full text being analysed
            position: Index in text where the number starts

        Returns:
            A key of NUMBER_CONTEXTS, or 'unknown' when no keyword is found
        """
        window_end = max(0, position)
        window_start = max(0, window_end - 100)

        best_type = 'unknown'
        best_end = -1
        for num_type, pattern in self._context_patterns.items():
            # Searching with pos/endpos instead of slicing lets \b see the
            # character before the window, so a word cut by the window edge
            # is not mistaken for a short keyword.
            for match in pattern.finditer(text, window_start, window_end):
                if match.end() > best_end:
                    best_end = match.end()
                    best_type = num_type

        return best_type

    def extract_sum_insured(self, text: str) -> Optional[float]:
        """
        Extract the sum insured value from text.

        Args:
            text: Text to search

        Returns:
            The largest amount labelled as sum insured; if none is labelled,
            the largest amount found; None when the text has no amounts
        """
        numbers = self.extract_numbers(text)

        # First, look for explicitly labeled sum insured
        for num in numbers:
            if num['context'] == 'sum_insured':
                return num['value']

        # Otherwise, return the largest number (likely to be sum insured)
        if numbers:
            return max(num['value'] for num in numbers)

        return None

    def extract_premium(self, text: str) -> Optional[float]:
        """
        Extract the premium amount from text.

        Args:
            text: Text to search

        Returns:
            The largest amount labelled as premium, or None
        """
        for num in self.extract_numbers(text):
            if num['context'] == 'premium':
                return num['value']

        return None

    def calculate_sum(self, values: List[Optional[float]]) -> float:
        """Calculate sum of values, skipping None entries."""
        return sum(v for v in values if v is not None)

    def calculate_average(
        self, values: List[Optional[float]]
    ) -> Optional[float]:
        """Calculate average of the non-None values, or None if there are none."""
        valid_values = [v for v in values if v is not None]
        if valid_values:
            return sum(valid_values) / len(valid_values)
        return None


# Singleton instance
number_extractor = NumberExtractor()