# File size: 2,743 Bytes
# e03ff34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
Simplified data processor for Hugging Face compatibility.
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging

logger = logging.getLogger(__name__)

class AdvancedDataProcessor:
    """Feature engineering and windowing helper for time series forecasting.

    Attributes:
        config: Settings dict supplied by the caller. ``engineer_features``
            reads the optional keys ``'lags'``, ``'rolling_windows'`` and
            ``'differences'`` from it.
        scalers: Dict initialised empty; the methods of this class do not
            populate it.
        feature_columns: Column names produced by the most recent
            ``engineer_features`` call (everything except the date and
            value columns).
    """

    def __init__(self, config: dict):
        """Store *config* and initialise empty bookkeeping containers."""
        self.config = config
        self.scalers = {}
        self.feature_columns = []

    def engineer_features(self, df: pd.DataFrame,
                         date_col: str,
                         value_col: str) -> pd.DataFrame:
        """Create calendar, lag, rolling, difference and seasonal features.

        The input frame is not modified; a copy with the extra columns is
        returned. ``self.feature_columns`` is updated as a side effect.

        Args:
            df: Input data. Rows are used in their existing order for the
                shift/rolling/diff computations.
            date_col: Name of a datetime-like column (it must support the
                ``.dt`` accessor).
            value_col: Name of the column holding the series values.

        Returns:
            A copy of *df* with the engineered columns appended and missing
            values back-filled, then forward-filled.
        """
        df = df.copy()
        dates = df[date_col]

        # DateTime features
        df['year'] = dates.dt.year
        df['month'] = dates.dt.month
        # isocalendar() yields the nullable UInt32 extension dtype; convert it
        # to a plain NumPy dtype so that ``DataFrame.values`` on the feature
        # columns stays numeric instead of degrading to ``object``.
        iso_week = dates.dt.isocalendar().week
        df['week'] = iso_week.astype(
            'float64' if iso_week.isna().any() else 'int64'
        )
        df['day'] = dates.dt.day
        df['dayofweek'] = dates.dt.dayofweek
        df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
        df['quarter'] = dates.dt.quarter
        df['dayofyear'] = dates.dt.dayofyear

        # Lag features
        for lag in self.config.get('lags', [1, 7, 30]):
            df[f'lag_{lag}'] = df[value_col].shift(lag)

        # Rolling statistics
        for window in self.config.get('rolling_windows', [7, 30]):
            rolling = df[value_col].rolling(window=window)
            df[f'rolling_mean_{window}'] = rolling.mean()
            df[f'rolling_std_{window}'] = rolling.std()

        # Difference features
        for diff in self.config.get('differences', [1, 7]):
            df[f'diff_{diff}'] = df[value_col].diff(diff)

        # Seasonal features (annual cycle encoded on the unit circle)
        angle = 2 * np.pi * df['dayofyear'] / 365
        df['seasonal_sin'] = np.sin(angle)
        df['seasonal_cos'] = np.cos(angle)

        # Handle missing values. ``fillna(method=...)`` is deprecated in
        # recent pandas, so use the dedicated methods. Back-fill runs first,
        # which means the leading NaNs created by shift/rolling/diff are
        # filled from later rows; forward-fill then covers any trailing gaps.
        df = df.bfill().ffill()

        self.feature_columns = [
            col for col in df.columns if col not in [date_col, value_col]
        ]

        return df

    def create_sequences(self, df: pd.DataFrame,
                       target_col: str,
                       feature_cols: list,
                       seq_length: int = 30,
                       forecast_horizon: int = 7) -> tuple:
        """Create sliding-window sequences for deep learning models.

        Args:
            df: Frame containing the feature and target columns.
            target_col: Name of the column to forecast.
            feature_cols: Names of the input feature columns.
            seq_length: Number of consecutive rows in each input window.
            forecast_horizon: Number of rows after each window used as the
                target.

        Returns:
            A tuple ``(X, y)`` of NumPy arrays with shapes
            ``(n_windows, seq_length, n_features)`` and
            ``(n_windows, forecast_horizon)``. When *df* has too few rows to
            form a single window, ``n_windows`` is 0 and the remaining
            dimensions are kept.
        """
        # list() lets callers pass a tuple or Index as well as a list.
        columns = list(feature_cols) + [target_col]
        data = df[columns].values

        n_windows = len(data) - seq_length - forecast_horizon + 1
        if n_windows <= 0:
            # Not enough rows: return empty arrays that still carry the
            # expected trailing dimensions rather than bare shape-(0,) arrays.
            n_features = data.shape[1] - 1
            empty_X = np.empty((0, max(seq_length, 0), n_features),
                               dtype=data.dtype)
            empty_y = np.empty((0, max(forecast_horizon, 0)),
                               dtype=data.dtype)
            return empty_X, empty_y

        # The last column of ``data`` is the target; the rest are features.
        X = [data[i:i + seq_length, :-1] for i in range(n_windows)]
        y = [
            data[i + seq_length:i + seq_length + forecast_horizon, -1]
            for i in range(n_windows)
        ]

        return np.array(X), np.array(y)