# TimeFlowPro / config / config.py
# Author: ArabovMK
# Commit: d8f69a9 ("Update all files")
# ============================================
# ENUMERATION CLASSES
# ============================================
from dataclasses import asdict, dataclass, field
from enum import Enum
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional
from venv import logger
class DataType(Enum):
    """Kinds of data a column can hold, used to pick a preprocessing strategy."""

    NUMERIC = "numeric"          # continuous / discrete numbers
    CATEGORICAL = "categorical"  # finite set of labels
    TEMPORAL = "temporal"        # dates, times, timestamps
    TEXT = "text"                # free-form strings
class PreprocessingMethod(Enum):
    """Supported preprocessing operations.

    The first group imputes or removes missing values, the second group
    handles outliers, and the rest are value transformations.
    """

    # Missing-value handling
    FILL_MEAN = "fill_mean"
    FILL_MEDIAN = "fill_median"
    FILL_INTERPOLATE = "fill_interpolate"
    FILL_KNN = "fill_knn"
    REMOVE = "remove"
    # Outlier handling
    CLIP = "clip"
    WINSORIZE = "winsorize"
    # Transformations
    NORMALIZE = "normalize"
    STANDARDIZE = "standardize"
    LOG_TRANSFORM = "log_transform"
    BOX_COX = "box_cox"
    DIFFERENCING = "differencing"
class SeasonalityType(Enum):
    """Seasonality period classes a time series may exhibit."""

    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"
    QUARTERLY = "quarterly"
    YEARLY = "yearly"
    MULTIPLE = "multiple"  # more than one seasonal period at once
# ============================================
# CLASS 1: CONFIGURATION
# ============================================
@dataclass
class Config:
    """Experiment configuration for data preprocessing.

    Instantiating the class has side effects: the results directory tree is
    created and the module logger's level is set (see ``__post_init__``).
    The configuration can be round-tripped to/from JSON via ``save``/``load``.
    """

    # Paths and directories
    data_path: str = 'temp_data.csv'
    results_dir: str = 'data_preprocessing_results'

    # Temporal parameters
    start_year: int = 1970
    end_year: int = 1990
    freq: str = 'D'  # Data frequency: D (daily), H (hourly), M (monthly)

    # Target variable
    target_column: str = 'raskhodvoda'

    # Feature parameters
    max_lags: int = 12
    seasonal_period: int = 365
    rolling_windows: List[int] = field(default_factory=lambda: [7, 30, 90, 365])
    expanding_windows: List[int] = field(default_factory=lambda: [30, 90, 365])

    # Processing parameters
    missing_threshold: float = 0.3  # Threshold for dropping columns with missing values
    outlier_method: str = 'iqr'  # Outlier detection method: iqr, zscore, lof
    outlier_alpha: float = 1.5  # IQR multiplier
    outlier_contamination: float = 0.1  # For methods like LOF

    # Data splitting
    test_size: float = 0.2
    validation_size: float = 0.1
    split_method: str = 'time'  # time, random, expanding_window

    # Scaling
    scaling_method: str = 'robust'  # standard, minmax, robust, none

    # Feature selection
    feature_selection_method: str = 'correlation'  # correlation, mutual_info, rf, pca
    max_features: int = 50

    # Validation
    enable_validation: bool = True
    validation_rules: Dict = field(default_factory=dict)

    # Visualisation
    save_plots: bool = True
    # NOTE(review): 'seaborn' was removed as a matplotlib style name in
    # matplotlib >= 3.6 (use 'seaborn-v0_8'); default kept unchanged for
    # backward compatibility — confirm against the plotting code.
    plot_style: str = 'seaborn'

    # Performance
    use_multiprocessing: bool = False
    n_jobs: int = -1
    chunk_size: int = 10000

    # Logging
    log_level: str = 'INFO'
    save_reports: bool = True

    def __post_init__(self) -> None:
        """Create the results directory tree, configure logging, and fill in
        default validation rules when none were supplied."""
        self.create_directories()
        self.setup_logging()
        # Setting default validation rules
        if not self.validation_rules:
            self.validation_rules = {
                'min_rows': 100,
                'max_missing_percentage': 30,
                'min_unique_values': 2,
                'max_skewness': 3,
                'max_kurtosis': 10
            }

    def create_directories(self) -> None:
        """Create the full directory tree for preprocessing results.

        Idempotent: existing directories are left untouched.
        """
        base = Path(self.results_dir)
        subdirs = [
            'plots',
            'plots/time_series',
            'plots/distributions',
            'plots/correlations',
            'plots/features',
            'tables',
            'processed_data',
            'models',
            'reports',
            'logs',
            'checkpoints',
        ]
        for directory in [base] + [base / sub for sub in subdirs]:
            directory.mkdir(parents=True, exist_ok=True)
        # Fix: use this module's logger rather than the logger accidentally
        # imported from the stdlib 'venv' package.
        logging.getLogger(__name__).info(f"Directories created in {self.results_dir}")

    def setup_logging(self) -> None:
        """Set this module's logger level from ``self.log_level``.

        Raises:
            ValueError: if ``log_level`` is not a standard logging level name
                (previously this surfaced as an opaque ``AttributeError``).
        """
        level = getattr(logging, self.log_level.upper(), None)
        if not isinstance(level, int):
            raise ValueError(f"Invalid log level: {self.log_level!r}")
        logging.getLogger(__name__).setLevel(level)

    def to_dict(self) -> Dict:
        """Convert the configuration to a plain dictionary (JSON-serializable)."""
        return asdict(self)

    def save(self, path: Optional[str] = None) -> None:
        """Save the configuration as JSON.

        Args:
            path: Destination file; defaults to ``<results_dir>/config.json``.
                The parent directory is created if missing.
        """
        if path is None:
            path = f'{self.results_dir}/config.json'
        target = Path(path)
        # Robustness: a caller-supplied path may point outside results_dir.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
        logging.getLogger(__name__).info(f"Configuration saved to {path}")

    @classmethod
    def load(cls, path: str) -> 'Config':
        """Load a configuration previously written by :meth:`save`.

        Note: ``__post_init__`` runs on the loaded instance, so directories
        are (re)created and logging is reconfigured.
        """
        with open(path, 'r', encoding='utf-8') as f:
            config_dict = json.load(f)
        return cls(**config_dict)