"""
Number Extractor Service
Handles extraction and normalization of numerical values from insurance documents.
Supports:
- Indian number formats (lakhs, crores)
- Currency symbols (₹, Rs., INR, USD)
- Comma-separated numbers
- Word numbers (One Hundred Million)
- Percentage values
"""

import math
import re
from decimal import Decimal, InvalidOperation
from typing import Optional, List, Dict, Tuple


class NumberExtractor:
    """Extract and normalize numerical values from text."""
    
    # English number words for units, teens and tens (zero through ninety),
    # mapped to their integer values. Used by word_to_number().
    WORD_TO_NUMBER = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
        'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'thirty': 30,
        'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70,
        'eighty': 80, 'ninety': 90
    }
    
    # Multipliers for magnitude words: international (hundred, thousand,
    # million, billion) and Indian (lakh/lac, crore), with plural spellings
    # listed as separate keys. The keys are also joined into the alternation
    # used by the magnitude regex in _compile_patterns().
    MAGNITUDE_WORDS = {
        'hundred': 100,
        'thousand': 1000,
        'lakh': 100000,
        'lac': 100000,
        'lakhs': 100000,
        'lacs': 100000,
        'million': 1000000,
        'crore': 10000000,
        'crores': 10000000,
        'billion': 1000000000
    }
    
    # Currency patterns: regex fragments (not plain strings) that mark an
    # amount as belonging to the currency code used as the key.
    CURRENCY_PATTERNS = {
        'INR': [r'₹', r'Rs\.?', r'INR', r'Rupees?'],
        'USD': [r'\$', r'USD', r'Dollars?'],
        'EUR': [r'€', r'EUR', r'Euros?']
    }
    
    # Context keywords for identifying number types. All keywords are
    # lowercase; _determine_number_context() looks for them in the text that
    # precedes a number.
    NUMBER_CONTEXTS = {
        'sum_insured': ['sum insured', 'total sum insured', 'tsi', 'si', 'insured value', 
                        'coverage amount', 'insured amount', 'sum assured'],
        'premium': ['premium', 'premium amount', 'total premium', 'net premium', 
                    'gross premium', 'annual premium'],
        'tax': ['tax', 'gst', 'cgst', 'sgst', 'igst', 'service tax'],
        'deductible': ['deductible', 'excess', 'franchise']
    }
    
    def __init__(self) -> None:
        """Create an extractor and compile its regex patterns once, up front."""
        self._compile_patterns()
    
    def _compile_patterns(self):
        """
        Compile the regex patterns used for number extraction.

        Sets four attributes on the instance:
            pattern_currency: currency marker followed by an amount, e.g.
                "₹1,00,000", "Rs. 1,00,000.00" or "INR 100000". Group 1 is
                the marker, group 2 the digits.
            pattern_number: plain number with optional commas and decimals,
                e.g. "1,00,00,000" or "100,000,000".
            pattern_magnitude: number followed by a magnitude word, e.g.
                "10 crore" or "5.5 lakhs". Group 1 is the number, group 2
                the magnitude word.
            pattern_percent: number followed by "%", "percent" or
                "percentage". Group 1 is the number.

        All patterns except pattern_number are case-insensitive.
        """
        # Alphabetic markers (Rs, INR, USD, ...) must not begin in the middle
        # of a word; otherwise "members 500" is read as the amount "rs 500".
        # Symbol markers (₹, $, €) need no such guard.
        currency_tokens = []
        for patterns in self.CURRENCY_PATTERNS.values():
            for token in patterns:
                if token[0].isalpha():
                    token = rf'(?<![A-Za-z]){token}'
                currency_tokens.append(token)
        currency_symbols = '|'.join(currency_tokens)
        self.pattern_currency = re.compile(
            rf'({currency_symbols})\s*([\d,]+(?:\.\d{{1,2}})?)',
            re.IGNORECASE
        )

        # Plain number with commas: 1,00,00,000 or 100,000,000
        self.pattern_number = re.compile(
            r'\b([\d,]+(?:\.\d+)?)\b'
        )

        # Number with magnitude words: 10 crore, 5.5 lakhs
        magnitude_words = '|'.join(self.MAGNITUDE_WORDS.keys())
        self.pattern_magnitude = re.compile(
            rf'\b([\d,.]+)\s*({magnitude_words})\b',
            re.IGNORECASE
        )

        # Percentage: 10%, 10.5 percent.
        # The trailing word boundary applies only to the word forms: "%" is
        # not a word character, so a "\b" after it would fail whenever the
        # sign is followed by a space, punctuation or the end of the text.
        self.pattern_percent = re.compile(
            r'\b([\d.]+)\s*(?:%|(?:percentage|percent)\b)',
            re.IGNORECASE
        )
    
    def parse_number(self, num_str: str) -> Optional[float]:
        """
        Parse a number string to float, handling comma-grouped formats.

        Commas and spaces are removed before conversion, so both Indian
        ("1,00,000") and international ("100,000.50") grouping are accepted.

        Args:
            num_str: Number string (e.g., "1,00,000" or "100,000.50")

        Returns:
            Float value, or None when the input is empty, is not a number,
            or does not represent a finite value.
        """
        if not num_str:
            return None

        # Remove commas and spaces
        cleaned = str(num_str).replace(',', '').replace(' ', '').strip()

        try:
            value = float(cleaned)
        except ValueError:
            return None

        # float() also accepts "nan", "inf" and overflowing literals such as
        # "1e999"; none of these is a usable amount.
        return value if math.isfinite(value) else None
    
    def parse_indian_number(self, text: str) -> Optional[float]:
        """
        Parse a number followed by a magnitude word (lakhs, crores, ...).

        Only the first "<number> <magnitude word>" occurrence in the text is
        considered.

        Args:
            text: Text like "10 crore" or "5.5 lakhs"

        Returns:
            Float value, or None when the text is empty, contains no
            magnitude expression, or its numeric part cannot be parsed.
        """
        if not text:
            return None

        match = self.pattern_magnitude.search(text.lower().strip())
        if match is None:
            return None

        num_part = self.parse_number(match.group(1))
        if num_part is None:
            return None

        magnitude = self.MAGNITUDE_WORDS.get(match.group(2).lower(), 1)

        # Multiply in decimal arithmetic: in binary floating point
        # 1.1 * 100000 evaluates to 110000.00000000001, which is not
        # acceptable for monetary amounts. str() yields the shortest decimal
        # text that round-trips the parsed float.
        return float(Decimal(str(num_part)) * magnitude)
    
    def word_to_number(self, text: str) -> Optional[int]:
        """
        Convert word numbers to integers.

        Handles English number words combined with international and Indian
        magnitude words, hyphenated compounds ("twenty-one"), a bare
        magnitude ("hundred" -> 100), stacked magnitudes ("one thousand
        crore" -> 10,000,000,000) and digit tokens mixed in with words
        ("10 crore"). The word "and" and any unrecognised words are ignored.

        Args:
            text: Text like "One Hundred Million"

        Returns:
            Integer value, or None when the text is empty or the result is
            not greater than zero.
        """
        if not text:
            return None

        # Split hyphenated compounds into separate words; a hyphen that is
        # not between letters (e.g. a minus sign) is left untouched.
        normalized = re.sub(r'(?<=[a-z])-(?=[a-z])', ' ', text.lower())
        tokens = normalized.split()
        if not tokens:
            return None

        total = 0   # sum of completed groups (thousands and above)
        group = 0   # value being built below the next large magnitude

        for raw in tokens:
            # Drop punctuation attached to the word, e.g. "million." or "(five"
            token = raw.strip('.,;:()')
            if not token or token == 'and':
                continue

            if token in self.WORD_TO_NUMBER:
                group += self.WORD_TO_NUMBER[token]
            elif token in self.MAGNITUDE_WORDS:
                scale = self.MAGNITUDE_WORDS[token]
                if scale < 1000:
                    # "hundred": a bare "hundred" counts as one hundred
                    group = (group or 1) * scale
                elif group == 0 and 0 < total < scale:
                    # Stacked magnitudes such as "one thousand crore": the
                    # amount gathered so far is scaled, not added to.
                    total *= scale
                else:
                    total += (group or 1) * scale
                    group = 0
            else:
                # Unknown word, try to parse as number
                try:
                    group += int(token.replace(',', ''))
                except ValueError:
                    continue

        total += group
        return total if total > 0 else None
    
    def extract_numbers(self, text: str) -> List[Dict]:
        """
        Extract all numerical values from text with context.

        Two kinds of amounts are collected: numbers prefixed by a currency
        marker ("Rs. 1,00,000") and numbers followed by a magnitude word
        ("10 crore"). When both describe the same amount ("Rs. 10 crore"),
        only the larger reading is kept, and it carries the currency of the
        marker that prefixes it. Amounts without a marker default to INR.

        Args:
            text: Text to search for numbers

        Returns:
            List of dicts with number info, ordered by descending value:
            [{"value": 101000000, "context": "sum_insured", "currency": "INR",
              "original": "₹10,10,00,000", "position": 12}]
            "position" is the start offset of "original" within the text.
        """
        if not text:
            return []

        text_lower = text.lower()

        # Two equal values whose starts are closer than this many characters
        # are treated as one amount stated twice, e.g. "Rs.10,00,000 (10 lakh)".
        restatement_window = 20

        candidates = []                 # (start, end, record)
        currency_by_number_start = {}   # offset of the digits -> currency code

        # Extract currency amounts
        for match in self.pattern_currency.finditer(text):
            symbol = match.group(1)
            currency = 'INR'  # Default
            for code, patterns in self.CURRENCY_PATTERNS.items():
                if any(re.fullmatch(p, symbol, re.IGNORECASE) for p in patterns):
                    currency = code
                    break
            currency_by_number_start[match.start(2)] = currency

            value = self.parse_number(match.group(2))
            if value is None or not value > 0:
                continue

            candidates.append((match.start(), match.end(), {
                'value': value,
                'context': self._determine_number_context(text_lower, match.start()),
                'currency': currency,
                'original': match.group(),
                'position': match.start()
            }))

        # Extract numbers with magnitude words (10 crore, 5 lakhs)
        for match in self.pattern_magnitude.finditer(text):
            value = self.parse_indian_number(match.group())
            if value is None or not value > 0:
                continue

            # Reuse the currency of a marker that directly prefixes this same
            # number ("$5 million" is USD); otherwise assume INR.
            currency = currency_by_number_start.get(match.start(1), 'INR')

            candidates.append((match.start(), match.end(), {
                'value': value,
                'context': self._determine_number_context(text_lower, match.start()),
                'currency': currency,
                'original': match.group(),
                'position': match.start()
            }))

        # Remove duplicates, preferring larger values. A candidate is a
        # duplicate when its text overlaps an already kept match ("Rs. 10" vs
        # "10 crore"), or when it restates the same value close by. Distinct
        # amounts that merely sit near each other are all kept.
        kept = []
        for start, end, record in sorted(candidates, key=lambda c: -c[2]['value']):
            is_duplicate = any(
                (start < kept_end and kept_start < end)
                or (
                    abs(start - kept_start) < restatement_window
                    and record['value'] == kept_record['value']
                )
                for kept_start, kept_end, kept_record in kept
            )
            if not is_duplicate:
                kept.append((start, end, record))

        return [record for _, _, record in kept]
    
    def _determine_number_context(self, text: str, position: int) -> str:
        """
        Determine what type of number this is based on the preceding text.

        Looks at up to 100 characters before the number for the keywords in
        NUMBER_CONTEXTS. Keywords match whole words only (with an optional
        plural "s"/"es"), so "si" does not match inside "basis". When several
        keywords occur, the one ending closest to the number wins.

        Args:
            text: Text containing the number
            position: Offset in text at which the number starts

        Returns:
            A key of NUMBER_CONTEXTS, or 'unknown' when no keyword is found.
        """
        # Look at 100 chars before the number
        context_start = max(0, position - 100)

        best_type = 'unknown'
        best_end = -1
        for num_type, keywords in self.NUMBER_CONTEXTS.items():
            for keyword in keywords:
                pattern = re.compile(
                    rf'(?<!\w){re.escape(keyword)}(?:e?s)?(?!\w)',
                    re.IGNORECASE
                )
                # Search the window through pos/endpos rather than a slice so
                # that a word cut by the window start is not mistaken for a
                # whole word.
                for found in pattern.finditer(text, context_start, position):
                    if found.end() > best_end:
                        best_end = found.end()
                        best_type = num_type

        return best_type
    
    def extract_sum_insured(self, text: str) -> Optional[float]:
        """
        Extract the sum insured value from text.

        Args:
            text: Text to search

        Returns:
            The value of the first extracted number whose context is
            'sum_insured'; failing that, the largest extracted value; or
            None when the text yields no numbers at all.
        """
        found = self.extract_numbers(text)
        if not found:
            return None

        # An explicitly labelled amount takes priority
        labelled = [entry['value'] for entry in found if entry['context'] == 'sum_insured']
        if labelled:
            return labelled[0]

        # No label: fall back to the largest amount as the likeliest candidate
        return max(entry['value'] for entry in found)
    
    def extract_premium(self, text: str) -> Optional[float]:
        """
        Extract the premium amount from text.

        Args:
            text: Text to search

        Returns:
            The value of the first extracted number whose context is
            'premium', or None when there is no such number.
        """
        premium_values = (
            entry['value']
            for entry in self.extract_numbers(text)
            if entry['context'] == 'premium'
        )
        return next(premium_values, None)
    
    def calculate_sum(self, values: List[float]) -> float:
        """
        Add up the given values, skipping None entries.

        Args:
            values: Numbers to add; None entries are ignored

        Returns:
            The total, which is 0 when nothing remains to add.
        """
        present = [value for value in values if value is not None]
        return sum(present)
    
    def calculate_average(self, values: List[float]) -> Optional[float]:
        """
        Compute the arithmetic mean of the given values, skipping None entries.

        Args:
            values: Numbers to average; None entries are ignored

        Returns:
            The mean of the remaining values, or None when none remain.
        """
        present = [value for value in values if value is not None]
        if not present:
            return None
        return sum(present) / len(present)


# Shared module-level instance, created (and its patterns compiled) at import time
number_extractor = NumberExtractor()