"""
Configuration module for MLOps platform.
Contains all configuration classes and constants.
"""

from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Dict, List, Optional


class LanguageCode(str, Enum):
    """Two-letter codes of the languages this module knows about.

    The ``str`` mixin makes every member compare equal to its plain string
    value (``LanguageCode.ENGLISH == "en"``). The values are the same strings
    used as keys in ``SUPPORTED_LANGUAGES`` and ``UI_TRANSLATIONS``.
    """
    ENGLISH = "en"
    CHINESE = "zh"
    KHMER = "km"


class ClassificationType(str, Enum):
    """Kinds of classification task.

    The ``str`` mixin makes every member compare equal to its plain string
    value (``ClassificationType.BINARY == "binary"``).
    """
    BINARY = "binary"
    MULTICLASS = "multiclass"


# Supported languages, keyed by the same codes as the LanguageCode values.
# Every entry carries four display strings:
#   name           - English name of the language
#   native_name    - name of the language written in that language
#   description    - one-line summary of the support offered
#   tokenizer_hint - short note about the tokenization approach
# NOTE(review): the "zh" description mentions character-level tokenization
# while its tokenizer_hint mentions jieba word segmentation - confirm which
# one is accurate.
SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = {
    "en": {
        "name": "English",
        "native_name": "English",
        "description": "English language support with standard NLP preprocessing",
        "tokenizer_hint": "Uses standard word tokenization"
    },
    "zh": {
        "name": "Chinese",
        "native_name": "中文",
        "description": "Chinese language support with character-level tokenization",
        "tokenizer_hint": "Uses jieba for word segmentation"
    },
    "km": {
        "name": "Khmer",
        "native_name": "ភាសាខ្មែរ",
        "description": "Khmer language support with specialized tokenization",
        "tokenizer_hint": "Uses ICU-based tokenization for Khmer script"
    }
}

# Catalogue of selectable model architectures, keyed by model identifier.
# Every entry has the same shape:
#   name            - display name
#   description     - one-line summary
#   languages       - list of language codes (keys of SUPPORTED_LANGUAGES)
#   max_length      - integer sequence-length limit (512 for every entry here)
#   recommended_for - short recommendation text
#   speed           - relative speed label ("Fast", "Medium", "Medium-Slow")
#   size            - size as a display string
#   best_use        - short description of the best-fit scenario
# NOTE(review): the keys look like Hugging Face hub identifiers and the size
# strings are hand-written - verify both against the actual checkpoints.
MODEL_ARCHITECTURES = {
    "roberta-base": {
        "name": "RoBERTa Base",
        "description": "Robust BERT model, excellent for English text classification",
        "languages": ["en"],
        "max_length": 512,
        "recommended_for": "English only, high accuracy needed",
        "speed": "Medium",
        "size": "355MB",
        "best_use": "English binary/multiclass classification"
    },
    "bert-base-multilingual-cased": {
        "name": "mBERT (Multilingual BERT)",
        "description": "Supports 104 languages - Good balance of performance and multilingual support",
        "languages": ["en", "zh", "km"],
        "max_length": 512,
        "recommended_for": "Multilingual tasks, balanced performance",
        "speed": "Medium",
        "size": "665MB",
        "best_use": "Multilingual classification, good general-purpose model"
    },
    "xlm-roberta-base": {
        "name": "XLM-RoBERTa Base",
        "description": "Best multilingual model - Highest accuracy for Chinese, Khmer, and other languages",
        "languages": ["en", "zh", "km"],
        "max_length": 512,
        "recommended_for": "Best multilingual performance, recommended for Chinese/Khmer",
        "speed": "Medium-Slow",
        "size": "1.03GB",
        "best_use": "When you need the best accuracy across multiple languages"
    },
    "distilbert-base-multilingual-cased": {
        "name": "DistilBERT Multilingual (Recommended for CPU)",
        "description": "Lightweight and fast - Perfect for CPU training or quick experiments",
        "languages": ["en", "zh", "km"],
        "max_length": 512,
        "recommended_for": "CPU training, fast experiments, limited resources",
        "speed": "Fast",
        "size": "525MB",
        "best_use": "CPU-only systems, quick prototyping, limited GPU memory"
    }
}

# Model selection guide: maps a usage scenario to the suggested model.
# Every value is a key of MODEL_ARCHITECTURES.
MODEL_SELECTION_GUIDE = {
    "cpu_training": "distilbert-base-multilingual-cased",
    "gpu_training_english": "roberta-base",
    "gpu_training_multilingual": "xlm-roberta-base",
    "quick_experiment": "distilbert-base-multilingual-cased",
    "production_english": "roberta-base",
    "production_multilingual": "xlm-roberta-base"
}


@dataclass
class TrainingConfig:
    """Settings for one model training run.

    Every field has a default, so ``TrainingConfig()`` is a complete
    configuration. Use :meth:`validate` to check the values and
    :meth:`to_dict` to obtain a plain-dictionary snapshot.
    """

    # Model settings
    model_name: str = "bert-base-multilingual-cased"
    num_labels: int = 2

    # Training hyperparameters
    learning_rate: float = 2e-5
    batch_size: int = 16
    num_epochs: int = 3
    warmup_ratio: float = 0.1
    weight_decay: float = 0.01
    max_length: int = 256

    # Data settings (the three splits are fractions of the dataset)
    train_split: float = 0.8
    validation_split: float = 0.1
    test_split: float = 0.1
    shuffle_data: bool = True
    random_seed: int = 42

    # Language settings (must be a key of SUPPORTED_LANGUAGES)
    language: str = "en"

    # Output settings
    output_dir: str = "trained_models"
    save_best_model: bool = True
    logging_steps: int = 10
    eval_strategy: str = "epoch"

    # Performance settings
    use_fp16: bool = False  # Disabled for CPU compatibility
    gradient_accumulation_steps: int = 1

    # Labels configuration (default_factory gives each instance its own list)
    label_names: List[str] = field(default_factory=lambda: ["Legitimate", "Phishing"])

    def validate(self) -> List[str]:
        """Check the configuration for invalid values.

        Nothing is raised and the configuration is not modified.

        Returns:
            Human-readable descriptions of every problem found, in a fixed
            order; an empty list when the configuration passes all checks.
        """
        issues: List[str] = []

        if self.learning_rate <= 0:
            issues.append("Learning rate must be positive")
        if self.batch_size < 1:
            issues.append("Batch size must be at least 1")
        if self.num_epochs < 1:
            issues.append("Number of epochs must be at least 1")
        if self.num_labels < 1:
            issues.append("Number of labels must be at least 1")
        if self.max_length < 1:
            issues.append("Max length must be at least 1")
        # Written as "not (lo <= x <= hi)" so that NaN is rejected as well.
        if not 0.0 <= self.warmup_ratio <= 1.0:
            issues.append("Warmup ratio must be between 0.0 and 1.0")
        if self.gradient_accumulation_steps < 1:
            issues.append("Gradient accumulation steps must be at least 1")

        splits = (self.train_split, self.validation_split, self.test_split)
        if any(not 0.0 <= part <= 1.0 for part in splits):
            issues.append("Each data split must be between 0.0 and 1.0")
        # Allow a tiny tolerance: fractions that add up to exactly 1 on paper
        # can land a hair above 1.0 in binary floating point.
        if sum(splits) > 1.0 + 1e-9:
            issues.append("Sum of data splits cannot exceed 1.0")
        if self.language not in SUPPORTED_LANGUAGES:
            issues.append(f"Unsupported language: {self.language}")

        return issues

    def to_dict(self) -> dict:
        """Return all fields of the configuration as a plain dictionary.

        Every dataclass field is included, so ``TrainingConfig(**cfg.to_dict())``
        rebuilds an equal configuration. Container values such as
        ``label_names`` are copied, so changing the returned dictionary never
        changes this configuration.

        Returns:
            A new ``dict`` mapping field name to field value.
        """
        return asdict(self)


@dataclass
class ExperimentConfig:
    """Settings that describe and track one experiment.

    Every field has a default, so ``ExperimentConfig()`` is a complete
    configuration.
    """

    experiment_name: str = "content_detection"
    # No run name is set by default.
    run_name: Optional[str] = None
    # Free-form string tags; default_factory gives each instance its own dict.
    tags: Dict[str, str] = field(default_factory=dict)
    description: str = ""

    # MLflow settings (optional); MLflow use is off by default.
    use_mlflow: bool = False
    # NOTE(review): presumably a local directory used as the MLflow tracking
    # location - confirm against the code that consumes it.
    mlflow_tracking_uri: str = "mlruns"
    

# UI translation strings: language code -> {string key -> display text}.
# The language codes are the same as the keys of SUPPORTED_LANGUAGES, and all
# three languages define the same set of string keys. The column names 'text'
# and 'label' inside "upload_help" are left untranslated in every language.
# Lookups go through get_translation(), which falls back to the "en" table.
# NOTE(review): the "zh" and "km" app_title values start with an emoji while
# the "en" one does not - confirm whether that difference is intended.
UI_TRANSLATIONS = {
    "en": {
        "app_title": "MLOps Training Platform",
        "sidebar_title": "Configuration",
        "language_select": "Select Target Language",
        "upload_data": "Upload Dataset",
        "training_config": "Training Configuration",
        "start_training": "Start Training",
        "training_progress": "Training Progress",
        "evaluation": "Model Evaluation",
        "download_model": "Download Model",
        "upload_help": "Upload a CSV file with 'text' and 'label' columns",
        "metrics_title": "Training Metrics",
        "confusion_matrix": "Confusion Matrix",
        "success_msg": "Training completed successfully!",
        "error_msg": "An error occurred during training",
        "welcome_msg": "Welcome to the MLOps Training Platform",
        "data_preview": "Data Preview",
        "class_distribution": "Class Distribution"
    },
    "zh": {
        "app_title": "🤖 机器学习运维训练平台",
        "sidebar_title": "配置",
        "language_select": "选择目标语言",
        "upload_data": "上传数据集",
        "training_config": "训练配置",
        "start_training": "开始训练",
        "training_progress": "训练进度",
        "evaluation": "模型评估",
        "download_model": "下载模型",
        "upload_help": "上传包含 'text' 和 'label' 列的CSV文件",
        "metrics_title": "训练指标",
        "confusion_matrix": "混淆矩阵",
        "success_msg": "训练成功完成！",
        "error_msg": "训练过程中发生错误",
        "welcome_msg": "欢迎使用机器学习运维训练平台",
        "data_preview": "数据预览",
        "class_distribution": "类别分布"
    },
    "km": {
        "app_title": "🤖 វេទិកាបណ្តុះបណ្តាល MLOps",
        "sidebar_title": "ការកំណត់",
        "language_select": "ជ្រើសរើសភាសាគោលដៅ",
        "upload_data": "ផ្ទុកឡើងសំណុំទិន្នន័យ",
        "training_config": "ការកំណត់ការបណ្តុះបណ្តាល",
        "start_training": "ចាប់ផ្តើមបណ្តុះបណ្តាល",
        "training_progress": "វឌ្ឍនភាពនៃការបណ្តុះបណ្តាល",
        "evaluation": "ការវាយតម្លៃម៉ូដែល",
        "download_model": "ទាញយកម៉ូដែល",
        "upload_help": "ផ្ទុកឡើងឯកសារ CSV ដែលមានជួរឈរ 'text' និង 'label'",
        "metrics_title": "រង្វាស់នៃការបណ្តុះបណ្តាល",
        "confusion_matrix": "ម៉ាទ្រីសភាពច្រឡំ",
        "success_msg": "ការបណ្តុះបណ្តាលបានជោគជ័យ!",
        "error_msg": "កំហុសមួយបានកើតឡើងក្នុងអំឡុងពេលបណ្តុះបណ្តាល",
        "welcome_msg": "សូមស្វាគមន៍មកកាន់វេទិកាបណ្តុះបណ្តាល MLOps",
        "data_preview": "មើលទិន្នន័យជាមុន",
        "class_distribution": "ការចែកចាយថ្នាក់"
    }
}


def get_translation(key: str, language: str = "en") -> str:
    """Look up the UI string for *key* in the requested *language*.

    Fallback order: the table for *language* (or the English table when
    *language* is not in ``UI_TRANSLATIONS``), then the English table, and
    finally *key* itself when no table defines it.
    """
    english = UI_TRANSLATIONS["en"]
    table = UI_TRANSLATIONS[language] if language in UI_TRANSLATIONS else english
    if key in table:
        return table[key]
    return english.get(key, key)