Spaces:
Running
Running
| """ | |
| Number Extractor Service | |
| Handles extraction and normalization of numerical values from insurance documents. | |
| Supports: | |
| - Indian number formats (lakhs, crores) | |
| - Currency symbols (₹, Rs., INR, USD) | |
| - Comma-separated numbers | |
| - Word numbers (One Hundred Million) | |
| - Percentage values | |
| """ | |
| import re | |
| from typing import Optional, List, Dict, Tuple | |
| from decimal import Decimal, InvalidOperation | |
class NumberExtractor:
    """Extract and normalize numerical values from text.

    Handles comma-grouped numbers (Indian ``1,00,000`` and Western
    ``100,000`` grouping), currency-prefixed amounts, amounts followed by a
    magnitude word (``10 crore``, ``5.5 lakhs``), spelled-out numbers and
    percentages. Extracted amounts are tagged with a coarse context
    (``sum_insured``, ``premium``, ``tax``, ``deductible`` or ``unknown``)
    based on the label text that precedes them.
    """

    # Spelled-out units, teens and tens.
    WORD_TO_NUMBER = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
        'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'thirty': 30,
        'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70,
        'eighty': 80, 'ninety': 90
    }

    # Multiplier words, covering both the Indian (lakh/crore) and the
    # Western (million/billion) systems.
    MAGNITUDE_WORDS = {
        'hundred': 100,
        'thousand': 1000,
        'lakh': 100000,
        'lac': 100000,
        'lakhs': 100000,
        'lacs': 100000,
        'million': 1000000,
        'crore': 10000000,
        'crores': 10000000,
        'billion': 1000000000
    }

    # Currency code -> regex fragments that identify it in text.
    CURRENCY_PATTERNS = {
        'INR': [r'₹', r'Rs\.?', r'INR', r'Rupees?'],
        'USD': [r'\$', r'USD', r'Dollars?'],
        'EUR': [r'€', r'EUR', r'Euros?']
    }

    # Number type -> label keywords that may precede such a number.
    NUMBER_CONTEXTS = {
        'sum_insured': ['sum insured', 'total sum insured', 'tsi', 'si', 'insured value',
                        'coverage amount', 'insured amount', 'sum assured'],
        'premium': ['premium', 'premium amount', 'total premium', 'net premium',
                    'gross premium', 'annual premium'],
        'tax': ['tax', 'gst', 'cgst', 'sgst', 'igst', 'service tax'],
        'deductible': ['deductible', 'excess', 'franchise']
    }

    def __init__(self):
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile the regex patterns used for number extraction."""
        # A number must start with a digit. Allowing a leading '.' or ','
        # made "Rs.10 crore" parse as ".10 crore" (0.1 crore).
        grouped_number = r'\d+(?:,\d+)*'

        # Alphabetic currency tokens must not be the tail of a longer word,
        # otherwise "users 100" matches as "rs 100". Symbol tokens (₹, $, €)
        # get no such guard so that forms like "US$5" keep working.
        currency_symbols = '|'.join(
            rf'(?<![A-Za-z]){p}' if p[0].isalpha() else p
            for patterns in self.CURRENCY_PATTERNS.values()
            for p in patterns
        )

        # Currency amount: ₹1,00,000 or Rs. 1,00,000.00 or INR 100000
        self.pattern_currency = re.compile(
            rf'({currency_symbols})\s*({grouped_number}(?:\.\d{{1,2}})?)',
            re.IGNORECASE
        )

        # Plain number with commas: 1,00,00,000 or 100,000,000
        self.pattern_number = re.compile(
            rf'\b({grouped_number}(?:\.\d+)?)\b'
        )

        # Number with magnitude word: 10 crore, 5.5 lakhs.
        # Longest alternatives first so plural forms are tried before their
        # singular prefixes.
        magnitude_words = '|'.join(
            re.escape(w)
            for w in sorted(self.MAGNITUDE_WORDS, key=len, reverse=True)
        )
        self.pattern_magnitude = re.compile(
            rf'\b({grouped_number}(?:\.\d+)?)\s*({magnitude_words})\b',
            re.IGNORECASE
        )

        # Percentage: 10%, 10.5 percent. The trailing \b applies only to the
        # word forms: '%' is a non-word character, so "%\b" would fail
        # whenever the sign is followed by a space or the end of the text.
        self.pattern_percent = re.compile(
            r'\b(\d+(?:\.\d+)?)\s*(?:%|percent(?:age)?\b)',
            re.IGNORECASE
        )

        # One whole-word, case-insensitive pattern per number type, so that
        # short keywords such as 'si' or 'tax' do not match inside longer
        # words like "insurance" or "basis".
        self._context_patterns = {
            num_type: re.compile(
                r'\b(?:'
                + '|'.join(
                    re.escape(kw)
                    for kw in sorted(keywords, key=len, reverse=True)
                )
                + r')\b',
                re.IGNORECASE
            )
            for num_type, keywords in self.NUMBER_CONTEXTS.items()
        }

    def parse_number(self, num_str: str) -> Optional[float]:
        """
        Parse a number string to float, handling Indian format.

        Args:
            num_str: Number string (e.g., "1,00,000" or "100,000.50")

        Returns:
            Float value, or None if the input is empty, not numeric, or
            not a finite number.
        """
        if not num_str:
            return None
        # Drop grouping commas and any whitespace (including tabs / NBSP).
        cleaned = re.sub(r'[\s,]', '', str(num_str))
        try:
            value = float(cleaned)
        except ValueError:
            return None
        # float() accepts "nan" and "inf"; neither is a usable amount.
        if value != value or value in (float('inf'), float('-inf')):
            return None
        return value

    def _magnitude_value(self, match) -> Optional[float]:
        """Return number * multiplier for a ``pattern_magnitude`` match."""
        number = self.parse_number(match.group(1))
        if number is None:
            return None
        return number * self.MAGNITUDE_WORDS.get(match.group(2).lower(), 1)

    def _currency_code(self, symbol: str) -> str:
        """Map a matched currency token to its code (defaults to 'INR')."""
        for code, patterns in self.CURRENCY_PATTERNS.items():
            if any(re.fullmatch(p, symbol, re.IGNORECASE) for p in patterns):
                return code
        return 'INR'

    def parse_indian_number(self, text: str) -> Optional[float]:
        """
        Parse Indian number format (lakhs, crores).

        Only the first "<number> <magnitude word>" occurrence is used.

        Args:
            text: Text like "10 crore" or "5.5 lakhs"

        Returns:
            Float value or None
        """
        if not text:
            return None
        match = self.pattern_magnitude.search(text.strip())
        if match is None:
            return None
        return self._magnitude_value(match)

    def word_to_number(self, text: str) -> Optional[int]:
        """
        Convert word numbers to integers.

        Words that are neither number words, magnitude words, "and", nor
        digit strings (e.g. "Rupees", "Only") are ignored.

        Args:
            text: Text like "One Hundred Million" or "twenty-five thousand"

        Returns:
            Integer value, or None if no positive number could be built
        """
        if not text:
            return None
        # Split hyphenated number words ("twenty-five") into two tokens.
        normalized = re.sub(r'(?<=[a-z])-(?=[a-z])', ' ', text.lower().strip())

        result = 0   # completed groups (thousand and above)
        current = 0  # group currently being accumulated
        for raw in normalized.split():
            word = raw.strip('.,;:')
            if not word or word == 'and':
                continue
            if word in self.WORD_TO_NUMBER:
                current += self.WORD_TO_NUMBER[word]
            elif word in self.MAGNITUDE_WORDS:
                magnitude = self.MAGNITUDE_WORDS[word]
                # A bare magnitude word ("hundred", "lakh") means one of it.
                current = (current or 1) * magnitude
                if magnitude >= 1000:
                    result += current
                    current = 0
            else:
                # Mixed notation such as "5 lakh": accept digit tokens.
                try:
                    current += int(word.replace(',', ''))
                except ValueError:
                    continue
        result += current
        return result if result > 0 else None

    def extract_numbers(self, text: str) -> List[Dict]:
        """
        Extract all numerical values from text with context.

        Two kinds of amounts are recognised: currency-prefixed numbers and
        numbers followed by a magnitude word. When both apply to the same
        number ("Rs. 10 crore", "$ 5 million") they are merged into one
        result that keeps the detected currency and the scaled value.
        Magnitude amounts without a currency token default to INR.

        Args:
            text: Text to search for numbers

        Returns:
            List of dicts ordered by descending value, e.g.:
            [{"value": 101000000, "context": "sum_insured", "currency": "INR",
              "original": "₹10,10,00,000", "position": 12}]
        """
        if not text:
            return []

        # Magnitude matches keyed by the offset of their numeric part, so a
        # currency match on the very same number can be paired with them.
        magnitude_by_start = {
            m.start(1): m for m in self.pattern_magnitude.finditer(text)
        }
        paired_starts = set()
        candidates = []  # (value, start, end, currency)

        for match in self.pattern_currency.finditer(text):
            currency = self._currency_code(match.group(1))
            magnitude_match = magnitude_by_start.get(match.start(2))
            if magnitude_match is not None:
                paired_starts.add(match.start(2))
                value = self._magnitude_value(magnitude_match)
                end = magnitude_match.end()
            else:
                value = self.parse_number(match.group(2))
                end = match.end()
            if value is not None and value > 0:
                candidates.append((value, match.start(), end, currency))

        for number_start, match in magnitude_by_start.items():
            if number_start in paired_starts:
                continue
            value = self._magnitude_value(match)
            if value is not None and value > 0:
                # Lakhs/crores are typically INR
                candidates.append((value, match.start(), match.end(), 'INR'))

        # Drop candidates whose text span overlaps an already accepted one,
        # preferring larger values. Comparing real spans (instead of a fixed
        # character distance) keeps distinct amounts that sit close together.
        accepted_spans = []
        unique_results = []
        for value, start, end, currency in sorted(candidates, key=lambda c: -c[0]):
            if any(start < s_end and s_start < end for s_start, s_end in accepted_spans):
                continue
            accepted_spans.append((start, end))
            unique_results.append({
                'value': value,
                # Context matching is case-insensitive, so the original text
                # is used; offsets are then guaranteed to line up.
                'context': self._determine_number_context(text, start),
                'currency': currency,
                'original': text[start:end],
                'position': start
            })
        return unique_results

    def _determine_number_context(self, text: str, position: int) -> str:
        """Determine what type of number this is based on surrounding text.

        Looks at up to 100 characters before ``position`` and returns the
        number type whose keyword occurs closest to the number; ties go to
        the type listed first in ``NUMBER_CONTEXTS``. Returns 'unknown'
        when no keyword is present.
        """
        window = text[max(0, position - 100):position]
        best_type = 'unknown'
        best_end = -1
        for num_type, pattern in self._context_patterns.items():
            last_end = -1
            for found in pattern.finditer(window):
                last_end = found.end()
            if last_end > best_end:
                best_type, best_end = num_type, last_end
        return best_type

    def extract_sum_insured(self, text: str) -> Optional[float]:
        """Extract the sum insured value from text.

        Returns the largest amount labelled as sum insured; if none is
        labelled, falls back to the largest amount found. None when the
        text contains no amounts.
        """
        numbers = self.extract_numbers(text)
        # First, look for explicitly labeled sum insured
        for num in numbers:
            if num['context'] == 'sum_insured':
                return num['value']
        # Otherwise, return the largest number (likely to be sum insured)
        if numbers:
            return max(num['value'] for num in numbers)
        return None

    def extract_premium(self, text: str) -> Optional[float]:
        """Extract the premium amount from text.

        Returns the largest amount labelled as premium, or None.
        """
        for num in self.extract_numbers(text):
            if num['context'] == 'premium':
                return num['value']
        return None

    def calculate_sum(self, values: List[float]) -> float:
        """Calculate sum of values, skipping None entries."""
        return sum(v for v in values if v is not None)

    def calculate_average(self, values: List[float]) -> Optional[float]:
        """Calculate average of the non-None values, or None if there are none."""
        valid_values = [v for v in values if v is not None]
        if valid_values:
            return sum(valid_values) / len(valid_values)
        return None
| # Shared module-level instance; constructing it compiles the regex patterns once at import time | |
| number_extractor = NumberExtractor() | |