Spaces:
Sleeping
Sleeping
import json
import logging
from dataclasses import asdict, dataclass, field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional

# Module-level logger. (Previously this was `from venv import logger`, which
# hijacked the stdlib venv module's internal logger by accident.)
logger = logging.getLogger(__name__)

# ============================================
# ENUMERATION CLASSES
# ============================================
class DataType(Enum):
    """Data types a dataset column can be classified as."""
    NUMERIC = "numeric"          # continuous or discrete numbers
    CATEGORICAL = "categorical"  # values drawn from a finite label set
    TEMPORAL = "temporal"        # dates / timestamps
    TEXT = "text"                # free-form strings
class PreprocessingMethod(Enum):
    """Data preprocessing methods.

    The FILL_* members are missing-value imputation strategies; the rest
    are outlier-handling and value-transformation techniques.
    """
    # Missing-value imputation
    FILL_MEAN = "fill_mean"                # replace missing values with the mean
    FILL_MEDIAN = "fill_median"            # replace missing values with the median
    FILL_INTERPOLATE = "fill_interpolate"  # interpolate between neighbouring values
    FILL_KNN = "fill_knn"                  # k-nearest-neighbours imputation
    # Outlier handling
    REMOVE = "remove"        # drop the offending rows
    CLIP = "clip"            # cap values at fixed bounds
    WINSORIZE = "winsorize"  # cap values at percentile bounds
    # Value transformations
    NORMALIZE = "normalize"        # scale to a fixed range
    STANDARDIZE = "standardize"    # zero mean, unit variance
    LOG_TRANSFORM = "log_transform"
    BOX_COX = "box_cox"            # Box-Cox power transform
    DIFFERENCING = "differencing"  # successive differences (time series)
class SeasonalityType(Enum):
    """Seasonality period types for a time series."""
    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"
    QUARTERLY = "quarterly"
    YEARLY = "yearly"
    MULTIPLE = "multiple"  # more than one seasonal period present
| # ============================================ | |
| # CLASS 1: CONFIGURATION | |
| # ============================================ | |
@dataclass
class Config:
    """Experiment configuration for data preprocessing.

    Declared as a ``@dataclass`` so that the ``field(default_factory=...)``
    defaults, the ``__post_init__`` hook and ``asdict()`` serialization
    actually work — without the decorator these were inert class attributes
    and ``Config()`` accepted no arguments.
    """
    # Paths and directories
    data_path: str = 'temp_data.csv'
    results_dir: str = 'data_preprocessing_results'
    # Temporal parameters
    start_year: int = 1970
    end_year: int = 1990
    freq: str = 'D'  # Data frequency: D (daily), H (hourly), M (monthly)
    # Target variable
    target_column: str = 'raskhodvoda'
    # Feature parameters
    max_lags: int = 12
    seasonal_period: int = 365
    rolling_windows: List[int] = field(default_factory=lambda: [7, 30, 90, 365])
    expanding_windows: List[int] = field(default_factory=lambda: [30, 90, 365])
    # Processing parameters
    missing_threshold: float = 0.3  # Threshold for dropping columns with missing values
    outlier_method: str = 'iqr'  # Outlier detection method: iqr, zscore, lof
    outlier_alpha: float = 1.5  # IQR multiplier
    outlier_contamination: float = 0.1  # For methods like LOF
    # Data splitting
    test_size: float = 0.2
    validation_size: float = 0.1
    split_method: str = 'time'  # time, random, expanding_window
    # Scaling
    scaling_method: str = 'robust'  # standard, minmax, robust, none
    # Feature selection
    feature_selection_method: str = 'correlation'  # correlation, mutual_info, rf, pca
    max_features: int = 50
    # Validation
    enable_validation: bool = True
    validation_rules: Dict = field(default_factory=dict)  # filled with defaults in __post_init__
    # Visualisation
    save_plots: bool = True
    plot_style: str = 'seaborn'
    # Performance
    use_multiprocessing: bool = False
    n_jobs: int = -1
    chunk_size: int = 10000
    # Logging
    log_level: str = 'INFO'
    save_reports: bool = True

    # No annotation, therefore NOT a dataclass field: excluded from
    # __init__/asdict(). Replaces the logger mistakenly taken from the
    # stdlib `venv` module.
    _logger = logging.getLogger(__name__)

    def __post_init__(self) -> None:
        """Create the results tree, configure logging, set default rules."""
        self.create_directories()
        self.setup_logging()
        # Apply default validation rules only when the caller supplied none.
        if not self.validation_rules:
            self.validation_rules = {
                'min_rows': 100,
                'max_missing_percentage': 30,
                'min_unique_values': 2,
                'max_skewness': 3,
                'max_kurtosis': 10
            }

    def create_directories(self) -> None:
        """Create the directory tree for preprocessing results (idempotent)."""
        dirs = [
            self.results_dir,
            f'{self.results_dir}/plots',
            f'{self.results_dir}/plots/time_series',
            f'{self.results_dir}/plots/distributions',
            f'{self.results_dir}/plots/correlations',
            f'{self.results_dir}/plots/features',
            f'{self.results_dir}/tables',
            f'{self.results_dir}/processed_data',
            f'{self.results_dir}/models',
            f'{self.results_dir}/reports',
            f'{self.results_dir}/logs',
            f'{self.results_dir}/checkpoints'
        ]
        for directory in dirs:
            Path(directory).mkdir(parents=True, exist_ok=True)
        self._logger.info(f"Directories created in {self.results_dir}")

    def setup_logging(self) -> None:
        """Apply the configured log level; raises AttributeError on a bad name."""
        log_level = getattr(logging, self.log_level.upper())
        self._logger.setLevel(log_level)

    def to_dict(self) -> Dict:
        """Convert configuration (dataclass fields only) to a dictionary."""
        return asdict(self)

    def save(self, path: Optional[str] = None) -> None:
        """Save configuration as JSON; defaults to <results_dir>/config.json."""
        if path is None:
            path = f'{self.results_dir}/config.json'
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
        self._logger.info(f"Configuration saved to {path}")

    @classmethod
    def load(cls, path: str) -> 'Config':
        """Load a configuration from a JSON file produced by `save`."""
        with open(path, 'r', encoding='utf-8') as f:
            config_dict = json.load(f)
        return cls(**config_dict)