Spaces:

jashdoshi77
/

notebooklm-fast

Running

notebooklm-fast / services /number_extractor.py

jashdoshi77

feat: Add AI-powered query understanding with DeepSeek parsing

64deb3c 4 days ago

10.3 kB

	"""
	Number Extractor Service
	Handles extraction and normalization of numerical values from insurance documents.
	Supports:
	- Indian number formats (lakhs, crores)
	- Currency symbols (₹, Rs., INR, USD)
	- Comma-separated numbers
	- Word numbers (One Hundred Million)
	- Percentage values
	"""

	import re
	from typing import Optional, List, Dict, Tuple
	from decimal import Decimal, InvalidOperation


	class NumberExtractor:
	    """Extract and normalize numerical values from text.

	    Recognizes currency-prefixed amounts (``₹1,00,000``, ``Rs. 500``,
	    ``USD 100``), digit + magnitude-word amounts (``10 crore``,
	    ``5.5 lakhs``), spelled-out numbers (``One Hundred Million``) and
	    percentages. All regexes are compiled once in ``__init__``.
	    """

	    # English number words (units, teens and tens) mapped to their values
	    WORD_TO_NUMBER = {
	        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
	        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
	        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
	        'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
	        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'thirty': 30,
	        'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70,
	        'eighty': 80, 'ninety': 90
	    }

	    # Multiplier words, covering both Indian (lakh/crore) and Western scales
	    MAGNITUDE_WORDS = {
	        'hundred': 100,
	        'thousand': 1000,
	        'lakh': 100000,
	        'lac': 100000,
	        'lakhs': 100000,
	        'lacs': 100000,
	        'million': 1000000,
	        'crore': 10000000,
	        'crores': 10000000,
	        'billion': 1000000000
	    }

	    # Currency code -> regex fragments that denote that currency
	    CURRENCY_PATTERNS = {
	        'INR': [r'₹', r'Rs\.?', r'INR', r'Rupees?'],
	        'USD': [r'\$', r'USD', r'Dollars?'],
	        'EUR': [r'€', r'EUR', r'Euros?']
	    }

	    # Context keywords for identifying number types
	    NUMBER_CONTEXTS = {
	        'sum_insured': ['sum insured', 'total sum insured', 'tsi', 'si', 'insured value',
	                        'coverage amount', 'insured amount', 'sum assured'],
	        'premium': ['premium', 'premium amount', 'total premium', 'net premium',
	                    'gross premium', 'annual premium'],
	        'tax': ['tax', 'gst', 'cgst', 'sgst', 'igst', 'service tax'],
	        'deductible': ['deductible', 'excess', 'franchise']
	    }

	    def __init__(self):
	        self._compile_patterns()

	    def _compile_patterns(self):
	        """Compile the regex patterns used for number extraction."""

	        def guard(token: str) -> str:
	            # Alphabetic tokens (Rs, INR, USD, ...) must not start in the
	            # middle of a word, otherwise "covers 500" would read as "Rs 500".
	            return rf'(?<![A-Za-z]){token}' if token[:1].isalpha() else token

	        # Currency amount: ₹1,00,000 or Rs. 1,00,000.00 or INR 100000
	        currency_symbols = '|'.join(
	            guard(p) for patterns in self.CURRENCY_PATTERNS.values() for p in patterns
	        )
	        self.pattern_currency = re.compile(
	            rf'({currency_symbols})\s*([\d,]+(?:\.\d{{1,2}})?)',
	            re.IGNORECASE
	        )

	        # One matcher per currency code, used to classify a captured symbol
	        self._currency_matchers = [
	            (code, re.compile('(?:' + '|'.join(patterns) + ')', re.IGNORECASE))
	            for code, patterns in self.CURRENCY_PATTERNS.items()
	        ]

	        # Plain number with commas: 1,00,00,000 or 100,000,000
	        self.pattern_number = re.compile(
	            r'\b([\d,]+(?:\.\d+)?)\b'
	        )

	        # Number with magnitude words: 10 crore, 5.5 lakhs
	        magnitude_words = '|'.join(self.MAGNITUDE_WORDS)
	        self.pattern_magnitude = re.compile(
	            rf'\b([\d,.]+)\s*({magnitude_words})\b',
	            re.IGNORECASE
	        )

	        # Percentage: 10%, 10.5 percent. No \b after '%': a word boundary
	        # there would require a letter/digit to follow the percent sign.
	        self.pattern_percent = re.compile(
	            r'\b([\d.]+)\s*(?:%|percent(?:age)?\b)',
	            re.IGNORECASE
	        )

	        # Whole-word keyword matchers per number type. The lookahead makes
	        # matches overlap-tolerant so the right-most keyword end is found;
	        # longer keywords are listed first so they win at a given start.
	        self._context_patterns = []
	        for num_type, keywords in self.NUMBER_CONTEXTS.items():
	            alternatives = '|'.join(
	                r'\s+'.join(re.escape(part) for part in kw.split())
	                for kw in sorted(keywords, key=len, reverse=True)
	            )
	            self._context_patterns.append(
	                (num_type, re.compile(rf'(?=\b({alternatives})\b)', re.IGNORECASE))
	            )

	    def _currency_code(self, symbol: str) -> str:
	        """Map a captured currency symbol/word to its code (default 'INR')."""
	        for code, matcher in self._currency_matchers:
	            if matcher.fullmatch(symbol):
	                return code
	        return 'INR'

	    def parse_number(self, num_str: str) -> Optional[float]:
	        """
	        Parse a number string to float, handling Indian format.

	        Args:
	            num_str: Number string (e.g., "1,00,000" or "100,000.50").
	                Non-string values are converted with ``str()`` first.

	        Returns:
	            Float value, or None if the input is None, empty or not numeric
	        """
	        # Explicit None check: a truthiness test would wrongly reject 0
	        if num_str is None:
	            return None

	        # Remove grouping commas and spaces
	        cleaned = str(num_str).replace(',', '').replace(' ', '').strip()

	        try:
	            return float(cleaned)
	        except ValueError:
	            return None

	    def parse_indian_number(self, text: str) -> Optional[float]:
	        """
	        Parse a digit + magnitude-word amount (lakhs, crores, million, ...).

	        Args:
	            text: Text like "10 crore" or "5.5 lakhs"

	        Returns:
	            Float value of the first such amount found, or None
	        """
	        if not text:
	            return None

	        match = self.pattern_magnitude.search(text.lower().strip())
	        if match is None:
	            return None

	        num_part = self.parse_number(match.group(1))
	        if num_part is None:
	            return None

	        return num_part * self.MAGNITUDE_WORDS.get(match.group(2).lower(), 1)

	    def word_to_number(self, text: str) -> Optional[int]:
	        """
	        Convert word numbers to integers.

	        Tokens that are neither number words, magnitude words, "and" nor
	        plain integers are ignored.

	        Args:
	            text: Text like "One Hundred Million" or "twenty-five"

	        Returns:
	            Integer value, or None if nothing adds up to a positive number
	        """
	        if not text:
	            return None

	        # Split hyphenated words ("twenty-five") without touching "-5"
	        normalized = re.sub(r'(?<=[a-z])-(?=[a-z])', ' ', text.lower())

	        total = 0    # completed groups (thousand and above)
	        current = 0  # group currently being accumulated

	        for raw in normalized.split():
	            word = raw.strip(',.;:()')

	            if word in self.WORD_TO_NUMBER:
	                current += self.WORD_TO_NUMBER[word]
	            elif word in self.MAGNITUDE_WORDS:
	                magnitude = self.MAGNITUDE_WORDS[word]
	                # A bare magnitude word ("hundred", "lakh") means one of it
	                current = (current or 1) * magnitude
	                if magnitude >= 1000:
	                    total += current
	                    current = 0
	            elif word == 'and':
	                continue
	            else:
	                # Unknown word, try to parse as a plain integer
	                try:
	                    current += int(word)
	                except ValueError:
	                    continue

	        total += current
	        return total if total > 0 else None

	    def extract_numbers(self, text: str) -> List[Dict]:
	        """
	        Extract all monetary values from text with context.

	        Args:
	            text: Text to search for numbers

	        Returns:
	            List of dicts, ordered by value (largest first):
	            [{"value": 101000000, "context": "sum_insured", "currency": "INR",
	              "original": "₹10,10,00,000", "position": 19}]
	            When two matches cover overlapping text (e.g. "Rs. 10" and
	            "10 crore"), only the larger value is kept.
	        """
	        if not text:
	            return []

	        candidates = []

	        # Extract currency amounts
	        for match in self.pattern_currency.finditer(text):
	            value = self.parse_number(match.group(2))
	            if value is None or value <= 0:
	                continue
	            candidates.append({
	                'value': value,
	                'context': self._determine_number_context(text, match.start()),
	                'currency': self._currency_code(match.group(1)),
	                'original': match.group(),
	                'position': match.start()
	            })

	        explicit_currency = list(candidates)

	        # Extract numbers with magnitude words (10 crore, 5 lakhs)
	        for match in self.pattern_magnitude.finditer(text):
	            value = self.parse_indian_number(match.group())
	            if value is None or value <= 0:
	                continue

	            # Lakhs/crores are typically INR, but an explicit symbol on the
	            # same number ("USD 5 million") takes precedence.
	            currency = 'INR'
	            for tagged in explicit_currency:
	                tagged_end = tagged['position'] + len(tagged['original'])
	                if tagged['position'] < match.end() and match.start() < tagged_end:
	                    currency = tagged['currency']
	                    break

	            candidates.append({
	                'value': value,
	                'context': self._determine_number_context(text, match.start()),
	                'currency': currency,
	                'original': match.group(),
	                'position': match.start()
	            })

	        # Drop matches whose text span overlaps an already kept, larger value
	        kept_spans = []
	        unique_results = []
	        for cand in sorted(candidates, key=lambda c: -c['value']):
	            start = cand['position']
	            end = start + len(cand['original'])
	            if any(start < k_end and k_start < end for k_start, k_end in kept_spans):
	                continue
	            kept_spans.append((start, end))
	            unique_results.append(cand)

	        return unique_results

	    def _determine_number_context(self, text: str, position: int) -> str:
	        """
	        Determine what type of number this is based on the preceding text.

	        Looks at up to 100 characters before ``position`` and returns the
	        type whose keyword (matched as a whole word, case-insensitively)
	        ends closest to the number, or 'unknown' if no keyword is present.
	        """
	        window = text[max(0, position - 100):position]

	        best_type = 'unknown'
	        best_end = -1
	        for num_type, pattern in self._context_patterns:
	            for found in pattern.finditer(window):
	                if found.end(1) > best_end:
	                    best_end = found.end(1)
	                    best_type = num_type

	        return best_type

	    def extract_sum_insured(self, text: str) -> Optional[float]:
	        """
	        Extract the sum insured value from text.

	        Returns the largest amount labeled as sum insured; if none is
	        labeled, falls back to the largest amount found, else None.
	        """
	        numbers = self.extract_numbers(text)

	        # extract_numbers is ordered largest-first, so the first labeled hit
	        # is the largest labeled amount
	        for num in numbers:
	            if num['context'] == 'sum_insured':
	                return num['value']

	        # Otherwise, return the largest number (likely to be sum insured)
	        if numbers:
	            return max(num['value'] for num in numbers)

	        return None

	    def extract_premium(self, text: str) -> Optional[float]:
	        """Extract the largest amount labeled as premium, or None."""
	        for num in self.extract_numbers(text):
	            if num['context'] == 'premium':
	                return num['value']

	        return None

	    def calculate_sum(self, values: List[float]) -> float:
	        """Calculate the sum of values, skipping None entries."""
	        return sum(v for v in values if v is not None)

	    def calculate_average(self, values: List[float]) -> Optional[float]:
	        """Calculate the average of the non-None values, or None if there are none."""
	        valid_values = [v for v in values if v is not None]
	        if not valid_values:
	            return None
	        return sum(valid_values) / len(valid_values)


	# Shared module-level instance; constructing it runs _compile_patterns() once at import time
	number_extractor = NumberExtractor()