""" Configuration module for MLOps platform. Contains all configuration classes and constants. """ from dataclasses import dataclass, field from typing import Dict, List, Optional from enum import Enum class LanguageCode(str, Enum): """Supported language codes.""" ENGLISH = "en" CHINESE = "zh" KHMER = "km" class ClassificationType(str, Enum): """Classification task types.""" BINARY = "binary" MULTICLASS = "multiclass" # Supported languages with display names SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = { "en": { "name": "English", "native_name": "English", "description": "English language support with standard NLP preprocessing", "tokenizer_hint": "Uses standard word tokenization" }, "zh": { "name": "Chinese", "native_name": "中文", "description": "Chinese language support with character-level tokenization", "tokenizer_hint": "Uses jieba for word segmentation" }, "km": { "name": "Khmer", "native_name": "ភាសាខ្មែរ", "description": "Khmer language support with specialized tokenization", "tokenizer_hint": "Uses ICU-based tokenization for Khmer script" } } # Model architectures supported with recommendations MODEL_ARCHITECTURES = { "roberta-base": { "name": "RoBERTa Base", "description": "Robust BERT model, excellent for English text classification", "languages": ["en"], "max_length": 512, "recommended_for": "English only, high accuracy needed", "speed": "Medium", "size": "355MB", "best_use": "English binary/multiclass classification" }, "bert-base-multilingual-cased": { "name": "mBERT (Multilingual BERT)", "description": "Supports 104 languages - Good balance of performance and multilingual support", "languages": ["en", "zh", "km"], "max_length": 512, "recommended_for": "Multilingual tasks, balanced performance", "speed": "Medium", "size": "665MB", "best_use": "Multilingual classification, good general-purpose model" }, "xlm-roberta-base": { "name": "XLM-RoBERTa Base", "description": "Best multilingual model - Highest accuracy for Chinese, Khmer, and other languages", "languages": ["en", "zh", "km"], "max_length": 512, "recommended_for": "Best multilingual performance, recommended for Chinese/Khmer", "speed": "Medium-Slow", "size": "1.03GB", "best_use": "When you need the best accuracy across multiple languages" }, "distilbert-base-multilingual-cased": { "name": "DistilBERT Multilingual (Recommended for CPU)", "description": "Lightweight and fast - Perfect for CPU training or quick experiments", "languages": ["en", "zh", "km"], "max_length": 512, "recommended_for": "CPU training, fast experiments, limited resources", "speed": "Fast", "size": "525MB", "best_use": "CPU-only systems, quick prototyping, limited GPU memory" } } # Model selection guide MODEL_SELECTION_GUIDE = { "cpu_training": "distilbert-base-multilingual-cased", "gpu_training_english": "roberta-base", "gpu_training_multilingual": "xlm-roberta-base", "quick_experiment": "distilbert-base-multilingual-cased", "production_english": "roberta-base", "production_multilingual": "xlm-roberta-base" } @dataclass class TrainingConfig: """Configuration for model training.""" # Model settings model_name: str = "bert-base-multilingual-cased" num_labels: int = 2 # Training hyperparameters learning_rate: float = 2e-5 batch_size: int = 16 num_epochs: int = 3 warmup_ratio: float = 0.1 weight_decay: float = 0.01 max_length: int = 256 # Data settings train_split: float = 0.8 validation_split: float = 0.1 test_split: float = 0.1 shuffle_data: bool = True random_seed: int = 42 # Language settings language: str = "en" # Output settings output_dir: str = "trained_models" save_best_model: bool = True logging_steps: int = 10 eval_strategy: str = "epoch" # Performance settings use_fp16: bool = False # Disabled for CPU compatibility gradient_accumulation_steps: int = 1 # Labels configuration label_names: List[str] = field(default_factory=lambda: ["Legitimate", "Phishing"]) def validate(self) -> List[str]: """Validate configuration and return list of warnings/errors.""" issues = [] if self.learning_rate <= 0: issues.append("Learning rate must be positive") if self.batch_size < 1: issues.append("Batch size must be at least 1") if self.num_epochs < 1: issues.append("Number of epochs must be at least 1") if self.train_split + self.validation_split + self.test_split > 1.0: issues.append("Sum of data splits cannot exceed 1.0") if self.language not in SUPPORTED_LANGUAGES: issues.append(f"Unsupported language: {self.language}") return issues def to_dict(self) -> dict: """Convert config to dictionary.""" return { "model_name": self.model_name, "num_labels": self.num_labels, "learning_rate": self.learning_rate, "batch_size": self.batch_size, "num_epochs": self.num_epochs, "warmup_ratio": self.warmup_ratio, "weight_decay": self.weight_decay, "max_length": self.max_length, "train_split": self.train_split, "validation_split": self.validation_split, "test_split": self.test_split, "shuffle_data": self.shuffle_data, "random_seed": self.random_seed, "language": self.language, "output_dir": self.output_dir, "label_names": self.label_names } @dataclass class ExperimentConfig: """Configuration for experiment tracking.""" experiment_name: str = "content_detection" run_name: Optional[str] = None tags: Dict[str, str] = field(default_factory=dict) description: str = "" # MLflow settings (optional) use_mlflow: bool = False mlflow_tracking_uri: str = "mlruns" # UI Translation strings UI_TRANSLATIONS = { "en": { "app_title": "MLOps Training Platform", "sidebar_title": "Configuration", "language_select": "Select Target Language", "upload_data": "Upload Dataset", "training_config": "Training Configuration", "start_training": "Start Training", "training_progress": "Training Progress", "evaluation": "Model Evaluation", "download_model": "Download Model", "upload_help": "Upload a CSV file with 'text' and 'label' columns", "metrics_title": "Training Metrics", "confusion_matrix": "Confusion Matrix", "success_msg": "Training completed successfully!", "error_msg": "An error occurred during training", "welcome_msg": "Welcome to the MLOps Training Platform", "data_preview": "Data Preview", "class_distribution": "Class Distribution" }, "zh": { "app_title": "🤖 机器学习运维训练平台", "sidebar_title": "配置", "language_select": "选择目标语言", "upload_data": "上传数据集", "training_config": "训练配置", "start_training": "开始训练", "training_progress": "训练进度", "evaluation": "模型评估", "download_model": "下载模型", "upload_help": "上传包含 'text' 和 'label' 列的CSV文件", "metrics_title": "训练指标", "confusion_matrix": "混淆矩阵", "success_msg": "训练成功完成!", "error_msg": "训练过程中发生错误", "welcome_msg": "欢迎使用机器学习运维训练平台", "data_preview": "数据预览", "class_distribution": "类别分布" }, "km": { "app_title": "🤖 វេទិកាបណ្តុះបណ្តាល MLOps", "sidebar_title": "ការកំណត់", "language_select": "ជ្រើសរើសភាសាគោលដៅ", "upload_data": "ផ្ទុកឡើងសំណុំទិន្នន័យ", "training_config": "ការកំណត់ការបណ្តុះបណ្តាល", "start_training": "ចាប់ផ្តើមបណ្តុះបណ្តាល", "training_progress": "វឌ្ឍនភាពនៃការបណ្តុះបណ្តាល", "evaluation": "ការវាយតម្លៃម៉ូដែល", "download_model": "ទាញយកម៉ូដែល", "upload_help": "ផ្ទុកឡើងឯកសារ CSV ដែលមានជួរឈរ 'text' និង 'label'", "metrics_title": "រង្វាស់នៃការបណ្តុះបណ្តាល", "confusion_matrix": "ម៉ាទ្រីសភាពច្រឡំ", "success_msg": "ការបណ្តុះបណ្តាលបានជោគជ័យ!", "error_msg": "កំហុសមួយបានកើតឡើងក្នុងអំឡុងពេលបណ្តុះបណ្តាល", "welcome_msg": "សូមស្វាគមន៍មកកាន់វេទិកាបណ្តុះបណ្តាល MLOps", "data_preview": "មើលទិន្នន័យជាមុន", "class_distribution": "ការចែកចាយថ្នាក់" } } def get_translation(key: str, language: str = "en") -> str: """Get translated string for given key and language.""" if language not in UI_TRANSLATIONS: language = "en" return UI_TRANSLATIONS[language].get(key, UI_TRANSLATIONS["en"].get(key, key))