"""
Language Detection Module.

Provides multi-language detection for:
- English (en)
- Hindi (hi)
- Hinglish (code-mixed Hindi and English)

Uses the langdetect library plus a script-based Hinglish check
(text that mixes Devanagari and Latin characters).
Performance target: <100ms per detection.
"""

import time
from typing import Tuple, Optional

import langdetect
from langdetect import detect_langs, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

from app.utils.logger import get_logger

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Set seed for reproducible results in langdetect
# (applied once at import time; LanguageDetector.__init__ assigns it again).
DetectorFactory.seed = 0

# Supported language codes
# NOTE(review): not referenced by any code visible in this file - confirm
# whether external callers rely on it.
SUPPORTED_LANGUAGES = {"en", "hi", "hinglish"}

# Default fallback values
# DEFAULT_LANGUAGE / ERROR_CONFIDENCE are returned for empty input and for
# unexpected errors during detection.
DEFAULT_LANGUAGE = "en"
# NOTE(review): DEFAULT_CONFIDENCE is not referenced by any code visible in
# this file; it currently carries the same value as ERROR_CONFIDENCE.
DEFAULT_CONFIDENCE = 0.3
ERROR_CONFIDENCE = 0.3

# Hinglish detection threshold - minimum ratio of each script type
# (compared against the Devanagari and Latin shares of the alphabetic
# characters in the text; both must reach this value).
HINGLISH_MIN_RATIO = 0.1


class LanguageDetector:
    """
    Object-style facade over this module's detection helpers.

    Handles English ('en'), Hindi ('hi') and Hinglish ('hinglish').
    Instances keep no per-call state; the only attribute records whether
    construction completed.

    Attributes:
        _initialized: True once the constructor has pinned the langdetect
            seed without error, False otherwise
    """

    def __init__(self) -> None:
        """
        Create a detector and pin the langdetect seed.

        Any exception raised during setup is logged and leaves
        ``_initialized`` set to False instead of propagating.
        """
        self._initialized = False
        try:
            # A fixed seed keeps langdetect's output reproducible
            DetectorFactory.seed = 0
            self._initialized = True
            logger.debug("LanguageDetector initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize LanguageDetector: {e}")
            self._initialized = False

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect the language of ``text``.

        Delegates to the module-level ``detect_language`` function.

        Args:
            text: Input text to analyze

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'en', 'hi', or 'hinglish'
            confidence: 0.0-1.0
        """
        result = detect_language(text)
        return result

    def is_hinglish(self, text: str) -> bool:
        """
        Report whether ``text`` mixes Devanagari and Latin characters.

        This is a pure presence check: one character of each script is
        enough, and no minimum ratio is applied.

        Args:
            text: Input text

        Returns:
            True if text contains both Devanagari and Latin characters
        """
        if not has_devanagari(text):
            return False
        return has_latin(text)

    def get_script_ratios(self, text: str) -> dict:
        """
        Compute each script's share of the characters in ``text``.

        The denominator is the full length of ``text``, so whitespace,
        digits and punctuation are all counted under "other".

        Args:
            text: Input text

        Returns:
            Dictionary with "devanagari", "latin" and "other" ratios;
            all three are 0.0 for empty input
        """
        if not text:
            return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

        length = len(text)
        dev_hits = 0
        lat_hits = 0
        for ch in text:
            if is_devanagari_char(ch):
                dev_hits += 1
            if is_latin_char(ch):
                lat_hits += 1
        remainder = length - dev_hits - lat_hits

        return {
            "devanagari": dev_hits / length,
            "latin": lat_hits / length,
            "other": remainder / length,
        }


def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect language of text.

    Detection priority:
    1. Return the default fallback for missing, blank, or non-string input
    2. Check for Hinglish (mixed scripts)
    3. Use langdetect for primary detection
    4. Fall back to character-based detection if langdetect fails or
       reports a language other than English or Hindi
    5. Default to English with low confidence on any unexpected error

    Args:
        text: Input message

    Returns:
        Tuple of (language_code, confidence)
        language_code: 'en', 'hi', or 'hinglish'
        confidence: 0.0-1.0

    Raises:
        Not expected to raise: invalid input and detection errors both
        yield the (DEFAULT_LANGUAGE, ERROR_CONFIDENCE) fallback
    """
    start_time = time.time()

    # Validate input type first: calling .strip() on a truthy non-string
    # (e.g. an int) would raise before the try block below is entered,
    # breaking the "fallback instead of exception" contract.
    if text is not None and not isinstance(text, str):
        logger.warning(
            f"Language detection error: expected str, got {type(text).__name__}"
        )
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    # Missing or whitespace-only input carries no signal
    if not text or not text.strip():
        logger.debug("Empty text provided, returning default")
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    text = text.strip()

    try:
        # Step 1: Check for Hinglish (code-mixed) first
        # Hinglish contains both Devanagari and Latin characters
        has_dev = has_devanagari(text)
        has_lat = has_latin(text)

        if has_dev and has_lat:
            # Calculate script ratios for confidence
            ratios = _get_script_ratios(text)

            # Both scripts must have significant presence for Hinglish
            if ratios["devanagari"] >= HINGLISH_MIN_RATIO and ratios["latin"] >= HINGLISH_MIN_RATIO:
                # Confidence grows with the share of the minority script
                # and is capped at 0.95
                confidence = min(0.95, 0.7 + (min(ratios["devanagari"], ratios["latin"]) * 2))
                _log_detection("hinglish", confidence, start_time)
                return ("hinglish", confidence)

        # Step 2: Use langdetect for primary detection
        detected_langs = detect_langs(text)

        if detected_langs:
            # Only the first candidate returned by langdetect is considered
            top_detection = detected_langs[0]
            lang_code = top_detection.lang
            confidence = top_detection.prob

            # Map to our supported categories
            if lang_code == "en":
                _log_detection("en", confidence, start_time)
                return ("en", confidence)
            elif lang_code == "hi":
                _log_detection("hi", confidence, start_time)
                return ("hi", confidence)
            else:
                # Unsupported language detected
                # Use character-based fallback
                return _character_based_detection(text, has_dev, has_lat, start_time)

        # No detection result
        return _character_based_detection(text, has_dev, has_lat, start_time)

    except LangDetectException as e:
        logger.debug(f"LangDetect exception: {e}")
        # Fallback to character-based detection
        return _character_based_detection(text, has_devanagari(text), has_latin(text), start_time)

    except Exception as e:
        # Last-resort boundary: never let detection errors reach the caller
        logger.warning(f"Language detection error: {e}")
        _log_detection(DEFAULT_LANGUAGE, ERROR_CONFIDENCE, start_time)
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)


def _character_based_detection(
    text: str,
    has_dev: bool,
    has_lat: bool,
    start_time: float
) -> Tuple[str, float]:
    """
    Choose a language purely from which scripts are present.

    Each combination of script flags maps to a fixed confidence:
    both scripts -> hinglish (0.7), Devanagari only -> hi (0.85),
    Latin only -> en (0.75), neither -> default language (0.5).

    Args:
        text: Input text (not inspected here; the flags carry the result)
        has_dev: Whether text contains Devanagari
        has_lat: Whether text contains Latin
        start_time: Detection start time for logging

    Returns:
        Tuple of (language_code, confidence)
    """
    if has_dev:
        outcome = ("hinglish", 0.7) if has_lat else ("hi", 0.85)
    elif has_lat:
        outcome = ("en", 0.75)
    else:
        # Neither script found, so there is nothing to go on
        outcome = (DEFAULT_LANGUAGE, 0.5)

    language, score = outcome
    _log_detection(language, score, start_time)
    return outcome


def _get_script_ratios(text: str) -> dict:
    """
    Compute each script's share of the alphabetic characters in text.

    Characters for which ``str.isalpha()`` is False (spaces, digits,
    punctuation, ...) are left out of both the counts and the denominator.

    Args:
        text: Input text

    Returns:
        Dictionary with "devanagari", "latin" and "other" ratios;
        all three are 0.0 when text is empty or has no alphabetic characters
    """
    letters = 0
    dev_hits = 0
    lat_hits = 0

    # One pass: tally letters and classify each by script
    for ch in text or "":
        if not ch.isalpha():
            continue
        letters += 1
        if is_devanagari_char(ch):
            dev_hits += 1
        if is_latin_char(ch):
            lat_hits += 1

    if letters == 0:
        return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

    remainder = letters - dev_hits - lat_hits
    return {
        "devanagari": dev_hits / letters,
        "latin": lat_hits / letters,
        "other": remainder / letters,
    }


def _log_detection(lang: str, confidence: float, start_time: float) -> None:
    """
    Emit a debug log line for a finished detection.

    Args:
        lang: Detected language code
        confidence: Confidence reported for the detection
        start_time: ``time.time()`` value captured when detection began
    """
    now = time.time()
    # Seconds -> milliseconds
    elapsed_ms = 1000 * (now - start_time)
    logger.debug(f"Detected language: {lang}, confidence: {confidence:.2f}, time: {elapsed_ms:.2f}ms")


def has_devanagari(text: str) -> bool:
    """
    Report whether any character of text is Devanagari.

    Membership is decided per character by ``is_devanagari_char``.

    Args:
        text: Input text

    Returns:
        True as soon as one Devanagari character is found; False for
        empty input or when none is present
    """
    for ch in text or "":
        if is_devanagari_char(ch):
            return True
    return False


def has_latin(text: str) -> bool:
    """
    Report whether any character of text is a Latin letter.

    Membership is decided per character by ``is_latin_char``.

    Args:
        text: Input text

    Returns:
        True as soon as one Latin letter is found; False for empty
        input or when none is present
    """
    for ch in text or "":
        if is_latin_char(ch):
            return True
    return False


def is_devanagari_char(char: str) -> bool:
    """
    Tell whether a character lies in the Devanagari Unicode block.

    The block spans U+0900 through U+097F, both ends included.

    Args:
        char: Single character

    Returns:
        True if the character falls inside that range
    """
    block_start, block_end = "\u0900", "\u097F"
    # In range means neither below the first nor above the last code point
    return not (char < block_start or char > block_end)


def is_latin_char(char: str) -> bool:
    """
    Check if a single character is an ASCII Latin letter.

    Args:
        char: Single character

    Returns:
        True if character is an ASCII letter (a-z or A-Z)
    """
    # Compare against both ASCII ranges directly rather than lowercasing
    # first: str.lower() maps some non-ASCII characters into a-z (KELVIN
    # SIGN U+212A becomes "k") or to a sequence that starts with an ASCII
    # letter (U+0130 becomes "i" + U+0307), which would wrongly pass the
    # range test.
    return "a" <= char <= "z" or "A" <= char <= "Z"


def get_language_name(code: str) -> str:
    """
    Translate a language code into a display name.

    Args:
        code: Language code ('en', 'hi', 'hinglish')

    Returns:
        The display name for a known code, otherwise "Unknown"
    """
    display_names = dict(
        en="English",
        hi="Hindi",
        hinglish="Hinglish (Code-Mixed)",
    )
    try:
        return display_names[code]
    except KeyError:
        # Unrecognized code
        return "Unknown"