# NOTE(review): removed stray "Spaces: / Sleeping / Sleeping" page-status text
# left over from web extraction; it was never part of this module.
"""
Configuration module for MLOps platform.
Contains all configuration classes and constants.
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional
class LanguageCode(str, Enum):
    """Supported language codes (ISO 639-1).

    Mixing in ``str`` makes each member compare equal to its plain
    string value (e.g. ``LanguageCode.ENGLISH == "en"``), so members can
    be used anywhere a language-code string is expected.
    """
    ENGLISH = "en"
    CHINESE = "zh"
    KHMER = "km"
class ClassificationType(str, Enum):
    """Classification task types.

    ``str`` mixin lets members compare equal to their raw string values,
    so they can be passed directly where a plain task-type string is used.
    """
    BINARY = "binary"
    MULTICLASS = "multiclass"
# Supported languages with display names.
# Fix: the "native_name" values for Chinese and Khmer were mojibake
# (UTF-8 text previously decoded as TIS-620); restored to the correct
# native-script names.
SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = {
    "en": {
        "name": "English",
        "native_name": "English",
        "description": "English language support with standard NLP preprocessing",
        "tokenizer_hint": "Uses standard word tokenization"
    },
    "zh": {
        "name": "Chinese",
        "native_name": "中文",
        "description": "Chinese language support with character-level tokenization",
        "tokenizer_hint": "Uses jieba for word segmentation"
    },
    "km": {
        "name": "Khmer",
        "native_name": "ភាសាខ្មែរ",
        "description": "Khmer language support with specialized tokenization",
        "tokenizer_hint": "Uses ICU-based tokenization for Khmer script"
    }
}
def _architecture(display_name, description, languages, recommended_for,
                  speed, size, best_use):
    """Assemble one architecture record; every supported model caps input at 512 tokens."""
    return {
        "name": display_name,
        "description": description,
        "languages": languages,
        "max_length": 512,
        "recommended_for": recommended_for,
        "speed": speed,
        "size": size,
        "best_use": best_use,
    }


# Model architectures supported with recommendations.
# Keys are Hugging Face model identifiers; values describe trade-offs
# to help users pick a checkpoint for their language(s) and hardware.
MODEL_ARCHITECTURES = {
    "roberta-base": _architecture(
        "RoBERTa Base",
        "Robust BERT model, excellent for English text classification",
        ["en"],
        "English only, high accuracy needed",
        "Medium",
        "355MB",
        "English binary/multiclass classification",
    ),
    "bert-base-multilingual-cased": _architecture(
        "mBERT (Multilingual BERT)",
        "Supports 104 languages - Good balance of performance and multilingual support",
        ["en", "zh", "km"],
        "Multilingual tasks, balanced performance",
        "Medium",
        "665MB",
        "Multilingual classification, good general-purpose model",
    ),
    "xlm-roberta-base": _architecture(
        "XLM-RoBERTa Base",
        "Best multilingual model - Highest accuracy for Chinese, Khmer, and other languages",
        ["en", "zh", "km"],
        "Best multilingual performance, recommended for Chinese/Khmer",
        "Medium-Slow",
        "1.03GB",
        "When you need the best accuracy across multiple languages",
    ),
    "distilbert-base-multilingual-cased": _architecture(
        "DistilBERT Multilingual (Recommended for CPU)",
        "Lightweight and fast - Perfect for CPU training or quick experiments",
        ["en", "zh", "km"],
        "CPU training, fast experiments, limited resources",
        "Fast",
        "525MB",
        "CPU-only systems, quick prototyping, limited GPU memory",
    ),
}
# Model selection guide: maps a usage scenario to the recommended key
# in MODEL_ARCHITECTURES above.
MODEL_SELECTION_GUIDE = {
    "cpu_training": "distilbert-base-multilingual-cased",        # lightest/fastest on CPU
    "gpu_training_english": "roberta-base",                      # English-only accuracy
    "gpu_training_multilingual": "xlm-roberta-base",             # best multilingual accuracy
    "quick_experiment": "distilbert-base-multilingual-cased",    # fast iteration
    "production_english": "roberta-base",
    "production_multilingual": "xlm-roberta-base"
}
@dataclass
class TrainingConfig:
    """Configuration for model training.

    All fields become keyword arguments of the generated ``__init__``,
    e.g. ``TrainingConfig(batch_size=8, language="zh")``.
    """
    # Fix: the original class declared annotated attributes and used
    # field(default_factory=...) but was never decorated with @dataclass.
    # Without the decorator, label_names is a bare dataclasses.Field
    # object and no keyword-accepting __init__ is generated.

    # Model settings
    model_name: str = "bert-base-multilingual-cased"
    num_labels: int = 2

    # Training hyperparameters
    learning_rate: float = 2e-5
    batch_size: int = 16
    num_epochs: int = 3
    warmup_ratio: float = 0.1
    weight_decay: float = 0.01
    max_length: int = 256  # max tokens per example fed to the tokenizer

    # Data settings (splits are fractions of the whole dataset)
    train_split: float = 0.8
    validation_split: float = 0.1
    test_split: float = 0.1
    shuffle_data: bool = True
    random_seed: int = 42

    # Language settings (must be a key of SUPPORTED_LANGUAGES)
    language: str = "en"

    # Output settings
    output_dir: str = "trained_models"
    save_best_model: bool = True
    logging_steps: int = 10
    eval_strategy: str = "epoch"

    # Performance settings
    use_fp16: bool = False  # Disabled for CPU compatibility
    gradient_accumulation_steps: int = 1

    # Labels configuration; default_factory avoids sharing one list
    # across instances.
    label_names: List[str] = field(default_factory=lambda: ["Legitimate", "Phishing"])

    def validate(self) -> List[str]:
        """Validate configuration and return list of warnings/errors.

        Returns:
            A list of human-readable issue strings; empty when the
            configuration is valid.
        """
        issues = []
        if self.learning_rate <= 0:
            issues.append("Learning rate must be positive")
        if self.batch_size < 1:
            issues.append("Batch size must be at least 1")
        if self.num_epochs < 1:
            issues.append("Number of epochs must be at least 1")
        if self.max_length < 1:
            issues.append("max_length must be at least 1")
        if self.train_split + self.validation_split + self.test_split > 1.0:
            issues.append("Sum of data splits cannot exceed 1.0")
        if self.language not in SUPPORTED_LANGUAGES:
            issues.append(f"Unsupported language: {self.language}")
        if len(self.label_names) != self.num_labels:
            issues.append("Number of label names must match num_labels")
        return issues

    def to_dict(self) -> dict:
        """Convert config to a plain dictionary (subset of fields used for persistence)."""
        return {
            "model_name": self.model_name,
            "num_labels": self.num_labels,
            "learning_rate": self.learning_rate,
            "batch_size": self.batch_size,
            "num_epochs": self.num_epochs,
            "warmup_ratio": self.warmup_ratio,
            "weight_decay": self.weight_decay,
            "max_length": self.max_length,
            "train_split": self.train_split,
            "validation_split": self.validation_split,
            "test_split": self.test_split,
            "shuffle_data": self.shuffle_data,
            "random_seed": self.random_seed,
            "language": self.language,
            "output_dir": self.output_dir,
            "label_names": self.label_names
        }
@dataclass
class ExperimentConfig:
    """Configuration for experiment tracking."""
    # Fix: the original class used field(default_factory=dict) without the
    # @dataclass decorator, so `tags` was a bare dataclasses.Field object
    # and no generated __init__ existed.
    experiment_name: str = "content_detection"
    run_name: Optional[str] = None  # auto-generated by the tracker when None
    tags: Dict[str, str] = field(default_factory=dict)  # fresh dict per instance
    description: str = ""

    # MLflow settings (optional)
    use_mlflow: bool = False
    mlflow_tracking_uri: str = "mlruns"
# UI Translation strings, keyed by language code then message key.
# Fix: the "zh" and "km" values were mojibake (UTF-8 text previously
# decoded as TIS-620, dropping non-printable bytes). The Chinese strings
# were reconstructed from the surviving byte patterns with high
# confidence; the Khmer strings are faithful translations of the English
# keys matching the recoverable fragments.
# NOTE(review): have native zh/km speakers confirm the restored wording.
UI_TRANSLATIONS = {
    "en": {
        "app_title": "MLOps Training Platform",
        "sidebar_title": "Configuration",
        "language_select": "Select Target Language",
        "upload_data": "Upload Dataset",
        "training_config": "Training Configuration",
        "start_training": "Start Training",
        "training_progress": "Training Progress",
        "evaluation": "Model Evaluation",
        "download_model": "Download Model",
        "upload_help": "Upload a CSV file with 'text' and 'label' columns",
        "metrics_title": "Training Metrics",
        "confusion_matrix": "Confusion Matrix",
        "success_msg": "Training completed successfully!",
        "error_msg": "An error occurred during training",
        "welcome_msg": "Welcome to the MLOps Training Platform",
        "data_preview": "Data Preview",
        "class_distribution": "Class Distribution"
    },
    "zh": {
        "app_title": "🤖 机器学习运维训练平台",
        "sidebar_title": "配置",
        "language_select": "选择目标语言",
        "upload_data": "上传数据集",
        "training_config": "训练配置",
        "start_training": "开始训练",
        "training_progress": "训练进度",
        "evaluation": "模型评估",
        "download_model": "下载模型",
        "upload_help": "上传包含 'text' 和 'label' 列的CSV文件",
        "metrics_title": "训练指标",
        "confusion_matrix": "混淆矩阵",
        "success_msg": "训练成功完成！",
        "error_msg": "训练过程中发生错误",
        "welcome_msg": "欢迎使用机器学习运维训练平台",
        "data_preview": "数据预览",
        "class_distribution": "类别分布"
    },
    "km": {
        "app_title": "🤖 វេទិកាបណ្តុះបណ្តាល MLOps",
        "sidebar_title": "ការកំណត់",
        "language_select": "ជ្រើសរើសភាសាគោលដៅ",
        "upload_data": "បញ្ចូលឡើងសំណុំទិន្នន័យ",
        "training_config": "ការកំណត់ការបណ្តុះបណ្តាល",
        "start_training": "ចាប់ផ្តើមការបណ្តុះបណ្តាល",
        "training_progress": "វឌ្ឍនភាពការបណ្តុះបណ្តាល",
        "evaluation": "ការវាយតម្លៃម៉ូដែល",
        "download_model": "ទាញយកម៉ូដែល",
        "upload_help": "បញ្ចូលឡើងឯកសារ CSV ដែលមានជួរឈរ 'text' និង 'label'",
        "metrics_title": "រង្វាស់ការបណ្តុះបណ្តាល",
        "confusion_matrix": "ម៉ាទ្រីសច្របូកច្របល់",
        "success_msg": "ការបណ្តុះបណ្តាលបានជោគជ័យ!",
        "error_msg": "កំហុសមួយបានកើតឡើងក្នុងអំឡុងពេលបណ្តុះបណ្តាល",
        "welcome_msg": "សូមស្វាគមន៍មកកាន់វេទិកាបណ្តុះបណ្តាល MLOps",
        "data_preview": "មើលទិន្នន័យជាមុន",
        "class_distribution": "ការចែកចាយថ្នាក់"
    }
}
def get_translation(key: str, language: str = "en") -> str:
    """Look up the UI string for *key* in *language*.

    Unknown languages fall back to English; a key missing from both the
    requested language and English is returned unchanged.
    """
    lang = language if language in UI_TRANSLATIONS else "en"
    english = UI_TRANSLATIONS["en"]
    return UI_TRANSLATIONS[lang].get(key, english.get(key, key))