# notebooklm-fast/services/number_extractor.py
# jashdoshi77
# feat: Add AI-powered query understanding with DeepSeek parsing
# 64deb3c
"""
Number Extractor Service
Handles extraction and normalization of numerical values from insurance documents.
Supports:
- Indian number formats (lakhs, crores)
- Currency symbols (₹, Rs., INR, USD)
- Comma-separated numbers
- Word numbers (One Hundred Million)
- Percentage values
"""
import re
from typing import Optional, List, Dict, Tuple
from decimal import Decimal, InvalidOperation
class NumberExtractor:
    """Extract and normalize numerical values from text.

    The extractor recognises currency-prefixed amounts ("Rs. 1,00,000"),
    numbers followed by a magnitude word ("5.5 lakhs", "10 crore"),
    spelled-out numbers ("One Hundred Million") and percentages, and labels
    each extracted amount with the kind of figure it appears to be
    (sum insured, premium, tax, deductible) based on the preceding text.
    """

    # Number words below one hundred (units, teens and tens).
    WORD_TO_NUMBER = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
        'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'thirty': 30,
        'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70,
        'eighty': 80, 'ninety': 90
    }

    # Multiplier words, covering both Western and Indian units.
    MAGNITUDE_WORDS = {
        'hundred': 100,
        'thousand': 1000,
        'lakh': 100000,
        'lac': 100000,
        'lakhs': 100000,
        'lacs': 100000,
        'million': 1000000,
        'crore': 10000000,
        'crores': 10000000,
        'billion': 1000000000
    }

    # Currency code -> regex fragments that denote that currency.
    CURRENCY_PATTERNS = {
        'INR': [r'₹', r'Rs\.?', r'INR', r'Rupees?'],
        'USD': [r'\$', r'USD', r'Dollars?'],
        'EUR': [r'€', r'EUR', r'Euros?']
    }

    # Number type -> keywords that label a following number as that type.
    NUMBER_CONTEXTS = {
        'sum_insured': ['sum insured', 'total sum insured', 'tsi', 'si', 'insured value',
                        'coverage amount', 'insured amount', 'sum assured'],
        'premium': ['premium', 'premium amount', 'total premium', 'net premium',
                    'gross premium', 'annual premium'],
        'tax': ['tax', 'gst', 'cgst', 'sgst', 'igst', 'service tax'],
        'deductible': ['deductible', 'excess', 'franchise']
    }

    # How many characters before a number are searched for a context keyword.
    CONTEXT_WINDOW = 100

    # Equal values whose matches start within this many characters of each
    # other are treated as one amount restated ("Rs. 1,00,000 (1 lakh)").
    DEDUP_WINDOW = 20

    def __init__(self):
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile regex patterns for number extraction."""
        # Currency amount: ₹1,00,000 or Rs. 1,00,000.00 or INR 100000.
        # Alphabetic symbols must not start in the middle of a word, otherwise
        # "Rs" is found inside "years 2020" and 2020 becomes a rupee amount.
        currency_symbols = '|'.join(
            (r'(?<![A-Za-z])' + p) if p[0].isalpha() else p
            for patterns in self.CURRENCY_PATTERNS.values()
            for p in patterns
        )
        # The number must start and end on a digit so that a trailing list
        # comma ("Rs. 500, Tax ...") is not swallowed into the match.
        self.pattern_currency = re.compile(
            rf'({currency_symbols})\s*(\d+(?:,\d+)*(?:\.\d{{1,2}})?)',
            re.IGNORECASE
        )

        # Plain number with commas: 1,00,00,000 or 100,000,000
        # (not used by the methods of this class; kept as a public attribute).
        self.pattern_number = re.compile(
            r'\b([\d,]+(?:\.\d+)?)\b'
        )

        # Number with magnitude words: 10 crore, 5.5 lakhs
        magnitude_words = '|'.join(self.MAGNITUDE_WORDS.keys())
        self.pattern_magnitude = re.compile(
            rf'\b([\d,.]+)\s*({magnitude_words})\b',
            re.IGNORECASE
        )

        # Percentage: 10%, 10.5 percent.
        # The word boundary applies only to the spelled-out forms: "%" is not
        # a word character, so "%\b" would fail before a space or end of text.
        self.pattern_percent = re.compile(
            r'\b([\d.]+)\s*(?:%|(?:percentage|percent)\b)',
            re.IGNORECASE
        )

        # Context keywords as whole-word patterns (optional plural suffix,
        # flexible whitespace between the words of a phrase). Whole-word
        # matching keeps short keywords such as "si" from matching inside
        # unrelated words like "basis".
        self._context_patterns = {
            num_type: [
                re.compile(
                    r'(?<!\w)'
                    + r'\s+'.join(re.escape(part) for part in keyword.split())
                    + r'(?:e?s)?(?!\w)',
                    re.IGNORECASE
                )
                for keyword in keywords
            ]
            for num_type, keywords in self.NUMBER_CONTEXTS.items()
        }

    def parse_number(self, num_str: str) -> Optional[float]:
        """
        Parse a number string to float, handling Indian format.

        Args:
            num_str: Number string (e.g., "1,00,000" or "100,000.50")

        Returns:
            Float value, or None when the input is empty or not a number
        """
        if not num_str:
            return None
        # Comma grouping (Western or Indian) and spaces carry no value.
        cleaned = str(num_str).replace(',', '').replace(' ', '').strip()
        try:
            return float(cleaned)
        except ValueError:
            return None

    def parse_indian_number(self, text: str) -> Optional[float]:
        """
        Parse a number followed by a magnitude word (lakhs, crores, ...).

        Args:
            text: Text like "10 crore" or "5.5 lakhs"

        Returns:
            Float value of the first such expression found, or None
        """
        match = self.pattern_magnitude.search(text.lower().strip())
        if not match:
            return None
        num_part = self.parse_number(match.group(1))
        if num_part is None:
            return None
        magnitude = self.MAGNITUDE_WORDS.get(match.group(2).lower(), 1)
        return num_part * magnitude

    def word_to_number(self, text: str) -> Optional[int]:
        """
        Convert word numbers to integers.

        Words that are neither number words, magnitude words nor digit strings
        are ignored, so the number may be embedded in other text. Hyphenated
        compounds ("twenty-five") and compound units ("one lakh crore",
        "two thousand five hundred crore") are supported.

        Args:
            text: Text like "One Hundred Million"

        Returns:
            Integer value, or None when no positive number was found
        """
        if not text:
            return None
        # Split hyphens only between letters so "twenty-five" becomes two
        # number words while tokens such as "-5" are left untouched.
        words = re.sub(r'(?<=[a-z])-(?=[a-z])', ' ', text.lower()).split()
        if not words:
            return None

        result = 0   # value of the groups already closed by a unit >= 1000
        current = 0  # value of the group currently being read
        for word in words:
            word = word.strip(',.;:')
            if word in self.WORD_TO_NUMBER:
                current += self.WORD_TO_NUMBER[word]
            elif word in self.MAGNITUDE_WORDS:
                magnitude = self.MAGNITUDE_WORDS[word]
                if magnitude >= 1000:
                    # Everything accumulated below this unit is scaled by it,
                    # so "one lakh crore" is 1e5 * 1e7 rather than 1e5 + 1e7.
                    lower = result % magnitude
                    group = lower + current
                    result += (group or 1) * magnitude - lower
                    current = 0
                else:
                    # A bare "hundred" means one hundred.
                    current = (current or 1) * magnitude
            elif word == 'and':
                continue
            else:
                # Unknown word: accept digit strings, ignore anything else.
                try:
                    current += int(word)
                except ValueError:
                    pass
        result += current
        return result if result > 0 else None

    def _currency_for_symbol(self, symbol: str) -> str:
        """Return the currency code whose pattern matches *symbol* (default INR)."""
        for code, patterns in self.CURRENCY_PATTERNS.items():
            if any(re.fullmatch(p, symbol, re.IGNORECASE) for p in patterns):
                return code
        return 'INR'

    def extract_numbers(self, text: str) -> List[Dict]:
        """
        Extract all numerical values from text with context.

        Args:
            text: Text to search for numbers

        Returns:
            List of dicts with number info, ordered by descending value:
            [{"value": 101000000, "context": "sum_insured", "currency": "INR",
              "original": "₹10,10,00,000", "position": 12}]
        """
        if not text:
            return []

        # (start, end, result) triples; spans are only used for de-duplication.
        candidates = []
        # (start, end, currency) of every currency-prefixed number, so that a
        # magnitude expression can inherit the currency written before it.
        currency_spans = []

        # Currency amounts. The original text is searched throughout because
        # lowercasing can change string length for some Unicode characters,
        # which would shift match positions.
        for match in self.pattern_currency.finditer(text):
            start, end = match.span()
            currency = self._currency_for_symbol(match.group(1))
            currency_spans.append((start, end, currency))
            value = self.parse_number(match.group(2))
            if value is None or value <= 0:
                continue
            candidates.append((start, end, {
                'value': value,
                'context': self._determine_number_context(text, start),
                'currency': currency,
                'original': match.group(),
                'position': start
            }))

        # Numbers with magnitude words (10 crore, 5 lakhs)
        for match in self.pattern_magnitude.finditer(text):
            value = self.parse_indian_number(match.group())
            if value is None or value <= 0:
                continue
            start, end = match.span()
            # "USD 5 million" overlaps the currency match "USD 5"; without a
            # written currency, fall back to INR.
            currency = next(
                (code for c_start, c_end, code in currency_spans
                 if c_start < end and start < c_end),
                'INR'
            )
            candidates.append((start, end, {
                'value': value,
                'context': self._determine_number_context(text, start),
                'currency': currency,
                'original': match.group(),
                'position': start
            }))

        # De-duplicate, preferring larger values. Two candidates are the same
        # amount when their matched spans overlap ("Rs. 10" inside
        # "Rs. 10 crore") or when they carry the same value close together.
        # Distinct amounts that merely sit near each other are both kept.
        kept = []
        for start, end, result in sorted(candidates, key=lambda c: -c[2]['value']):
            duplicate = any(
                (start < k_end and k_start < end)
                or (result['value'] == k_result['value']
                    and abs(start - k_start) < self.DEDUP_WINDOW)
                for k_start, k_end, k_result in kept
            )
            if not duplicate:
                kept.append((start, end, result))
        return [result for _, _, result in kept]

    def _determine_number_context(self, text: str, position: int) -> str:
        """
        Determine what type of number this is based on the preceding text.

        The CONTEXT_WINDOW characters before *position* are searched for the
        keywords in NUMBER_CONTEXTS (case-insensitively, as whole words). When
        several keywords occur, the one ending closest to the number wins, so
        in "Sum Insured: ... Premium: Rs. 5,000" the amount is a premium.

        Args:
            text: Full text the number was found in
            position: Index in *text* at which the number starts

        Returns:
            A key of NUMBER_CONTEXTS, or 'unknown' when no keyword is found
        """
        context_start = max(0, position - self.CONTEXT_WINDOW)
        best_type = 'unknown'
        best_end = -1
        for num_type, patterns in self._context_patterns.items():
            for pattern in patterns:
                # Searching with pos/endpos instead of slicing lets the
                # look-behind see the character just before the window, so a
                # word cut by the window edge is not taken for a keyword.
                for hit in pattern.finditer(text, context_start, position):
                    if hit.end() > best_end:
                        best_end = hit.end()
                        best_type = num_type
        return best_type

    def extract_sum_insured(self, text: str) -> Optional[float]:
        """
        Extract the sum insured value from text.

        Returns the largest amount labelled as sum insured; when none is
        labelled, falls back to the largest amount found. None if the text
        contains no amounts.
        """
        numbers = self.extract_numbers(text)
        # extract_numbers orders by descending value, so the first labelled
        # entry is the largest one.
        for num in numbers:
            if num['context'] == 'sum_insured':
                return num['value']
        # Otherwise, return the largest number (likely to be sum insured)
        if numbers:
            return max(num['value'] for num in numbers)
        return None

    def extract_premium(self, text: str) -> Optional[float]:
        """
        Extract the premium amount from text.

        Returns the largest amount labelled as premium, or None when no
        amount carries that label.
        """
        for num in self.extract_numbers(text):
            if num['context'] == 'premium':
                return num['value']
        return None

    def calculate_sum(self, values: List[float]) -> float:
        """Calculate the sum of values, skipping None entries."""
        return sum(v for v in values if v is not None)

    def calculate_average(self, values: List[float]) -> Optional[float]:
        """Calculate the average of the non-None values, or None if there are none."""
        valid_values = [v for v in values if v is not None]
        if valid_values:
            return sum(valid_values) / len(valid_values)
        return None
# Shared module-level instance. Constructing it compiles the extractor's
# regex patterns once, at import time.
number_extractor = NumberExtractor()