Spaces:

songhieng
/

MLOps-Platforms

Sleeping

App Files Files Community

songhieng committed on Jan 18

Commit

168a930

verified ·

1 Parent(s): 917be01

Update src/mlops/config.py

Browse files

Files changed (1) hide show

src/mlops/config.py +259 -259

src/mlops/config.py CHANGED Viewed

@@ -1,259 +1,259 @@
-"""
-Configuration module for MLOps platform.
-Contains all configuration classes and constants.
-"""
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional
-from enum import Enum
-class LanguageCode(str, Enum):
-    """Supported language codes."""
-    ENGLISH = "en"
-    CHINESE = "zh"
-    KHMER = "km"
-class ClassificationType(str, Enum):
-    """Classification task types."""
-    BINARY = "binary"
-    MULTICLASS = "multiclass"
-# Supported languages with display names
-SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = {
-    "en": {
-        "name": "English",
-        "native_name": "English",
-        "description": "English language support with standard NLP preprocessing",
-        "tokenizer_hint": "Uses standard word tokenization"
-    },
-    "zh": {
-        "name": "Chinese",
-        "native_name": "中文",
-        "description": "Chinese language support with character-level tokenization",
-        "tokenizer_hint": "Uses jieba for word segmentation"
-    },
-    "km": {
-        "name": "Khmer",
-        "native_name": "ភាសាខ្មែរ",
-        "description": "Khmer language support with specialized tokenization",
-        "tokenizer_hint": "Uses ICU-based tokenization for Khmer script"
-    }
-}
-# Model architectures supported with recommendations
-MODEL_ARCHITECTURES = {
-    "roberta-base": {
-        "name": "RoBERTa Base",
-        "description": "Robust BERT model, excellent for English text classification",
-        "languages": ["en"],
-        "max_length": 512,
-        "recommended_for": "English only, high accuracy needed",
-        "speed": "Medium",
-        "size": "355MB",
-        "best_use": "English binary/multiclass classification"
-    },
-    "bert-base-multilingual-cased": {
-        "name": "mBERT (Multilingual BERT)",
-        "description": "Supports 104 languages - Good balance of performance and multilingual support",
-        "languages": ["en", "zh", "km"],
-        "max_length": 512,
-        "recommended_for": "Multilingual tasks, balanced performance",
-        "speed": "Medium",
-        "size": "665MB",
-        "best_use": "Multilingual classification, good general-purpose model"
-    },
-    "xlm-roberta-base": {
-        "name": "XLM-RoBERTa Base",
-        "description": "Best multilingual model - Highest accuracy for Chinese, Khmer, and other languages",
-        "languages": ["en", "zh", "km"],
-        "max_length": 512,
-        "recommended_for": "Best multilingual performance, recommended for Chinese/Khmer",
-        "speed": "Medium-Slow",
-        "size": "1.03GB",
-        "best_use": "When you need the best accuracy across multiple languages"
-    },
-    "distilbert-base-multilingual-cased": {
-        "name": "DistilBERT Multilingual (Recommended for CPU)",
-        "description": "Lightweight and fast - Perfect for CPU training or quick experiments",
-        "languages": ["en", "zh", "km"],
-        "max_length": 512,
-        "recommended_for": "CPU training, fast experiments, limited resources",
-        "speed": "Fast",
-        "size": "525MB",
-        "best_use": "CPU-only systems, quick prototyping, limited GPU memory"
-    }
-}
-# Model selection guide
-MODEL_SELECTION_GUIDE = {
-    "cpu_training": "distilbert-base-multilingual-cased",
-    "gpu_training_english": "roberta-base",
-    "gpu_training_multilingual": "xlm-roberta-base",
-    "quick_experiment": "distilbert-base-multilingual-cased",
-    "production_english": "roberta-base",
-    "production_multilingual": "xlm-roberta-base"
-}
-@dataclass
-class TrainingConfig:
-    """Configuration for model training."""
-    # Model settings
-    model_name: str = "bert-base-multilingual-cased"
-    num_labels: int = 2
-    # Training hyperparameters
-    learning_rate: float = 2e-5
-    batch_size: int = 16
-    num_epochs: int = 3
-    warmup_ratio: float = 0.1
-    weight_decay: float = 0.01
-    max_length: int = 256
-    # Data settings
-    train_split: float = 0.8
-    validation_split: float = 0.1
-    test_split: float = 0.1
-    shuffle_data: bool = True
-    random_seed: int = 42
-    # Language settings
-    language: str = "en"
-    # Output settings
-    output_dir: str = "trained_models"
-    save_best_model: bool = True
-    logging_steps: int = 10
-    evaluation_strategy: str = "epoch"
-    # Performance settings
-    use_fp16: bool = False  # Disabled for CPU compatibility
-    gradient_accumulation_steps: int = 1
-    # Labels configuration
-    label_names: List[str] = field(default_factory=lambda: ["Legitimate", "Phishing"])
-    def validate(self) -> List[str]:
-        """Validate configuration and return list of warnings/errors."""
-        issues = []
-        if self.learning_rate <= 0:
-            issues.append("Learning rate must be positive")
-        if self.batch_size < 1:
-            issues.append("Batch size must be at least 1")
-        if self.num_epochs < 1:
-            issues.append("Number of epochs must be at least 1")
-        if self.train_split + self.validation_split + self.test_split > 1.0:
-            issues.append("Sum of data splits cannot exceed 1.0")
-        if self.language not in SUPPORTED_LANGUAGES:
-            issues.append(f"Unsupported language: {self.language}")
-        return issues
-    def to_dict(self) -> dict:
-        """Convert config to dictionary."""
-        return {
-            "model_name": self.model_name,
-            "num_labels": self.num_labels,
-            "learning_rate": self.learning_rate,
-            "batch_size": self.batch_size,
-            "num_epochs": self.num_epochs,
-            "warmup_ratio": self.warmup_ratio,
-            "weight_decay": self.weight_decay,
-            "max_length": self.max_length,
-            "train_split": self.train_split,
-            "validation_split": self.validation_split,
-            "test_split": self.test_split,
-            "shuffle_data": self.shuffle_data,
-            "random_seed": self.random_seed,
-            "language": self.language,
-            "output_dir": self.output_dir,
-            "label_names": self.label_names
-        }
-@dataclass
-class ExperimentConfig:
-    """Configuration for experiment tracking."""
-    experiment_name: str = "content_detection"
-    run_name: Optional[str] = None
-    tags: Dict[str, str] = field(default_factory=dict)
-    description: str = ""
-    # MLflow settings (optional)
-    use_mlflow: bool = False
-    mlflow_tracking_uri: str = "mlruns"
-# UI Translation strings
-UI_TRANSLATIONS = {
-    "en": {
-        "app_title": "MLOps Training Platform",
-        "sidebar_title": "Configuration",
-        "language_select": "Select Target Language",
-        "upload_data": "Upload Dataset",
-        "training_config": "Training Configuration",
-        "start_training": "Start Training",
-        "training_progress": "Training Progress",
-        "evaluation": "Model Evaluation",
-        "download_model": "Download Model",
-        "upload_help": "Upload a CSV file with 'text' and 'label' columns",
-        "metrics_title": "Training Metrics",
-        "confusion_matrix": "Confusion Matrix",
-        "success_msg": "Training completed successfully!",
-        "error_msg": "An error occurred during training",
-        "welcome_msg": "Welcome to the MLOps Training Platform",
-        "data_preview": "Data Preview",
-        "class_distribution": "Class Distribution"
-    },
-    "zh": {
-        "app_title": "🤖 机器学习运维训练平台",
-        "sidebar_title": "配置",
-        "language_select": "选择目标语言",
-        "upload_data": "上传数据集",
-        "training_config": "训练配置",
-        "start_training": "开始训练",
-        "training_progress": "训练进度",
-        "evaluation": "模型评估",
-        "download_model": "下载模型",
-        "upload_help": "上传包含 'text' 和 'label' 列的CSV文件",
-        "metrics_title": "训练指标",
-        "confusion_matrix": "混淆矩阵",
-        "success_msg": "训练成功完成！",
-        "error_msg": "训练过程中发生错误",
-        "welcome_msg": "欢迎使用机器学习运维训练平台",
-        "data_preview": "数据预览",
-        "class_distribution": "类别分布"
-    },
-    "km": {
-        "app_title": "🤖 វេទិកាបណ្តុះបណ្តាល MLOps",
-        "sidebar_title": "ការកំណត់",
-        "language_select": "ជ្រើសរើសភាសាគោលដៅ",
-        "upload_data": "ផ្ទុកឡើងសំណុំទិន្នន័យ",
-        "training_config": "ការកំណត់ការបណ្តុះបណ្តាល",
-        "start_training": "ចាប់ផ្តើមបណ្តុះបណ្តាល",
-        "training_progress": "វឌ្ឍនភាពនៃការបណ្តុះបណ្តាល",
-        "evaluation": "ការវាយតម្លៃម៉ូដែល",
-        "download_model": "ទាញយកម៉ូដែល",
-        "upload_help": "ផ្ទុកឡើងឯកសារ CSV ដែលមានជួរឈរ 'text' និង 'label'",
-        "metrics_title": "រង្វាស់នៃការបណ្តុះបណ្តាល",
-        "confusion_matrix": "ម៉ាទ្រីសភាពច្រឡំ",
-        "success_msg": "ការបណ្តុះបណ្តាលបានជោគជ័យ!",
-        "error_msg": "កំហុសមួយបានកើតឡើងក្នុងអំឡុងពេលបណ្តុះបណ្តាល",
-        "welcome_msg": "សូមស្វាគមន៍មកកាន់វេទិកាបណ្តុះបណ្តាល MLOps",
-        "data_preview": "មើលទិន្នន័យជាមុន",
-        "class_distribution": "ការចែកចាយថ្នាក់"
-    }
-}
-def get_translation(key: str, language: str = "en") -> str:
-    """Get translated string for given key and language."""
-    if language not in UI_TRANSLATIONS:
-        language = "en"
-    return UI_TRANSLATIONS[language].get(key, UI_TRANSLATIONS["en"].get(key, key))

+"""
+Configuration module for MLOps platform.
+Contains all configuration classes and constants.
+"""
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+from enum import Enum
+class LanguageCode(str, Enum):
+    """Supported language codes."""
+    ENGLISH = "en"
+    CHINESE = "zh"
+    KHMER = "km"
+class ClassificationType(str, Enum):
+    """Classification task types."""
+    BINARY = "binary"
+    MULTICLASS = "multiclass"
+# Supported languages with display names
+SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = {
+    "en": {
+        "name": "English",
+        "native_name": "English",
+        "description": "English language support with standard NLP preprocessing",
+        "tokenizer_hint": "Uses standard word tokenization"
+    },
+    "zh": {
+        "name": "Chinese",
+        "native_name": "中文",
+        "description": "Chinese language support with character-level tokenization",
+        "tokenizer_hint": "Uses jieba for word segmentation"
+    },
+    "km": {
+        "name": "Khmer",
+        "native_name": "ភាសាខ្មែរ",
+        "description": "Khmer language support with specialized tokenization",
+        "tokenizer_hint": "Uses ICU-based tokenization for Khmer script"
+    }
+}
+# Model architectures supported with recommendations
+MODEL_ARCHITECTURES = {
+    "roberta-base": {
+        "name": "RoBERTa Base",
+        "description": "Robust BERT model, excellent for English text classification",
+        "languages": ["en"],
+        "max_length": 512,
+        "recommended_for": "English only, high accuracy needed",
+        "speed": "Medium",
+        "size": "355MB",
+        "best_use": "English binary/multiclass classification"
+    },
+    "bert-base-multilingual-cased": {
+        "name": "mBERT (Multilingual BERT)",
+        "description": "Supports 104 languages - Good balance of performance and multilingual support",
+        "languages": ["en", "zh", "km"],
+        "max_length": 512,
+        "recommended_for": "Multilingual tasks, balanced performance",
+        "speed": "Medium",
+        "size": "665MB",
+        "best_use": "Multilingual classification, good general-purpose model"
+    },
+    "xlm-roberta-base": {
+        "name": "XLM-RoBERTa Base",
+        "description": "Best multilingual model - Highest accuracy for Chinese, Khmer, and other languages",
+        "languages": ["en", "zh", "km"],
+        "max_length": 512,
+        "recommended_for": "Best multilingual performance, recommended for Chinese/Khmer",
+        "speed": "Medium-Slow",
+        "size": "1.03GB",
+        "best_use": "When you need the best accuracy across multiple languages"
+    },
+    "distilbert-base-multilingual-cased": {
+        "name": "DistilBERT Multilingual (Recommended for CPU)",
+        "description": "Lightweight and fast - Perfect for CPU training or quick experiments",
+        "languages": ["en", "zh", "km"],
+        "max_length": 512,
+        "recommended_for": "CPU training, fast experiments, limited resources",
+        "speed": "Fast",
+        "size": "525MB",
+        "best_use": "CPU-only systems, quick prototyping, limited GPU memory"
+    }
+}
+# Model selection guide
+MODEL_SELECTION_GUIDE = {
+    "cpu_training": "distilbert-base-multilingual-cased",
+    "gpu_training_english": "roberta-base",
+    "gpu_training_multilingual": "xlm-roberta-base",
+    "quick_experiment": "distilbert-base-multilingual-cased",
+    "production_english": "roberta-base",
+    "production_multilingual": "xlm-roberta-base"
+}
+@dataclass
+class TrainingConfig:
+    """Configuration for model training."""
+    # Model settings
+    model_name: str = "bert-base-multilingual-cased"
+    num_labels: int = 2
+    # Training hyperparameters
+    learning_rate: float = 2e-5
+    batch_size: int = 16
+    num_epochs: int = 3
+    warmup_ratio: float = 0.1
+    weight_decay: float = 0.01
+    max_length: int = 256
+    # Data settings
+    train_split: float = 0.8
+    validation_split: float = 0.1
+    test_split: float = 0.1
+    shuffle_data: bool = True
+    random_seed: int = 42
+    # Language settings
+    language: str = "en"
+    # Output settings
+    output_dir: str = "trained_models"
+    save_best_model: bool = True
+    logging_steps: int = 10
+    eval_strategy: str = "epoch"
+    # Performance settings
+    use_fp16: bool = False  # Disabled for CPU compatibility
+    gradient_accumulation_steps: int = 1
+    # Labels configuration
+    label_names: List[str] = field(default_factory=lambda: ["Legitimate", "Phishing"])
+    def validate(self) -> List[str]:
+        """Validate configuration and return list of warnings/errors."""
+        issues = []
+        if self.learning_rate <= 0:
+            issues.append("Learning rate must be positive")
+        if self.batch_size < 1:
+            issues.append("Batch size must be at least 1")
+        if self.num_epochs < 1:
+            issues.append("Number of epochs must be at least 1")
+        if self.train_split + self.validation_split + self.test_split > 1.0:
+            issues.append("Sum of data splits cannot exceed 1.0")
+        if self.language not in SUPPORTED_LANGUAGES:
+            issues.append(f"Unsupported language: {self.language}")
+        return issues
+    def to_dict(self) -> dict:
+        """Convert config to dictionary."""
+        return {
+            "model_name": self.model_name,
+            "num_labels": self.num_labels,
+            "learning_rate": self.learning_rate,
+            "batch_size": self.batch_size,
+            "num_epochs": self.num_epochs,
+            "warmup_ratio": self.warmup_ratio,
+            "weight_decay": self.weight_decay,
+            "max_length": self.max_length,
+            "train_split": self.train_split,
+            "validation_split": self.validation_split,
+            "test_split": self.test_split,
+            "shuffle_data": self.shuffle_data,
+            "random_seed": self.random_seed,
+            "language": self.language,
+            "output_dir": self.output_dir,
+            "label_names": self.label_names
+        }
+@dataclass
+class ExperimentConfig:
+    """Configuration for experiment tracking."""
+    experiment_name: str = "content_detection"
+    run_name: Optional[str] = None
+    tags: Dict[str, str] = field(default_factory=dict)
+    description: str = ""
+    # MLflow settings (optional)
+    use_mlflow: bool = False
+    mlflow_tracking_uri: str = "mlruns"
+# UI Translation strings
+UI_TRANSLATIONS = {
+    "en": {
+        "app_title": "MLOps Training Platform",
+        "sidebar_title": "Configuration",
+        "language_select": "Select Target Language",
+        "upload_data": "Upload Dataset",
+        "training_config": "Training Configuration",
+        "start_training": "Start Training",
+        "training_progress": "Training Progress",
+        "evaluation": "Model Evaluation",
+        "download_model": "Download Model",
+        "upload_help": "Upload a CSV file with 'text' and 'label' columns",
+        "metrics_title": "Training Metrics",
+        "confusion_matrix": "Confusion Matrix",
+        "success_msg": "Training completed successfully!",
+        "error_msg": "An error occurred during training",
+        "welcome_msg": "Welcome to the MLOps Training Platform",
+        "data_preview": "Data Preview",
+        "class_distribution": "Class Distribution"
+    },
+    "zh": {
+        "app_title": "🤖 机器学习运维训练平台",
+        "sidebar_title": "配置",
+        "language_select": "选择目标语言",
+        "upload_data": "上传数据集",
+        "training_config": "训练配置",
+        "start_training": "开始训练",
+        "training_progress": "训练进度",
+        "evaluation": "模型评估",
+        "download_model": "下载模型",
+        "upload_help": "上传包含 'text' 和 'label' 列的CSV文件",
+        "metrics_title": "训练指标",
+        "confusion_matrix": "混淆矩阵",
+        "success_msg": "训练成功完成！",
+        "error_msg": "训练过程中发生错误",
+        "welcome_msg": "欢迎使用机器学习运维训练平台",
+        "data_preview": "数据预览",
+        "class_distribution": "类别分布"
+    },
+    "km": {
+        "app_title": "🤖 វេទិកាបណ្តុះបណ្តាល MLOps",
+        "sidebar_title": "ការកំណត់",
+        "language_select": "ជ្រើសរើសភាសាគោលដៅ",
+        "upload_data": "ផ្ទុកឡើងសំណុំទិន្នន័យ",
+        "training_config": "ការកំណត់ការបណ្តុះបណ្តាល",
+        "start_training": "ចាប់ផ្តើមបណ្តុះបណ្តាល",
+        "training_progress": "វឌ្ឍនភាពនៃការបណ្តុះបណ្តាល",
+        "evaluation": "ការវាយតម្លៃម៉ូដែល",
+        "download_model": "ទាញយកម៉ូដែល",
+        "upload_help": "ផ្ទុកឡើងឯកសារ CSV ដែលមានជួរឈរ 'text' និង 'label'",
+        "metrics_title": "រង្វាស់នៃការបណ្តុះបណ្តាល",
+        "confusion_matrix": "ម៉ាទ្រីសភាពច្រឡំ",
+        "success_msg": "ការបណ្តុះបណ្តាលបានជោគជ័យ!",
+        "error_msg": "កំហុសមួយបានកើតឡើងក្នុងអំឡុងពេលបណ្តុះបណ្តាល",
+        "welcome_msg": "សូមស្វាគមន៍មកកាន់វេទិកាបណ្តុះបណ្តាល MLOps",
+        "data_preview": "មើលទិន្នន័យជាមុន",
+        "class_distribution": "ការចែកចាយថ្នាក់"
+    }
+}
+def get_translation(key: str, language: str = "en") -> str:
+    """Get translated string for given key and language."""
+    if language not in UI_TRANSLATIONS:
+        language = "en"
+    return UI_TRANSLATIONS[language].get(key, UI_TRANSLATIONS["en"].get(key, key))