Navya-Sree committed on
Commit
e03ff34
·
verified ·
1 Parent(s): 559c862

Create src/data_processing/processor.py

Browse files
Files changed (1) hide show
  1. src/data_processing/processor.py +72 -0
src/data_processing/processor.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simplified data processor for Hugging Face compatibility.
3
+ """
4
+ import pandas as pd
5
+ import numpy as np
6
+ from datetime import datetime, timedelta
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class AdvancedDataProcessor:
    """Advanced data processor for time series forecasting.

    Builds calendar, lag, rolling-window, differencing and seasonal features
    from a dated value column, and slices a feature frame into fixed-length
    windows for sequence models.

    Attributes:
        config: Settings dict. The keys read by this class (all optional) are
            ``lags``, ``rolling_windows`` and ``differences``.
        scalers: Initialised to an empty dict; no method of this class
            writes to it.
        feature_columns: Names of the feature columns produced by the most
            recent ``engineer_features`` call (empty until then).
    """

    def __init__(self, config: dict):
        """Store the configuration and initialise empty state.

        Args:
            config: Settings dict; see the class docstring for the keys used.
        """
        self.config = config
        self.scalers = {}
        self.feature_columns = []

    def engineer_features(self, df: pd.DataFrame,
                          date_col: str,
                          value_col: str) -> pd.DataFrame:
        """Create comprehensive time series features.

        The input frame is copied, never modified. Rows are assumed to be in
        chronological order already: lags, rolling windows and differences
        are positional.

        Args:
            df: Source data containing ``date_col`` and ``value_col``.
            date_col: Name of the date column. If it is not already a
                datetime dtype it is parsed with ``pd.to_datetime`` for
                feature extraction only; the column itself is left as given.
            value_col: Name of the numeric column to derive features from.

        Returns:
            A copy of ``df`` with the feature columns appended and missing
            values back-filled, then forward-filled.

        Side effects:
            Sets ``self.feature_columns`` to every returned column except
            ``date_col`` and ``value_col``.
        """
        df = df.copy()

        dates = df[date_col]
        if not pd.api.types.is_datetime64_any_dtype(dates):
            # Without this, the ``.dt`` accessor below raises AttributeError
            # for string/object date columns.
            dates = pd.to_datetime(dates)

        # DateTime features
        df['year'] = dates.dt.year
        df['month'] = dates.dt.month
        # isocalendar() returns the nullable UInt32 extension dtype. Mixed
        # with NumPy columns it forces DataFrame.values to an object array,
        # so store a plain NumPy dtype instead (float only if NaT produced
        # missing weeks, which the fill step below then handles).
        iso_week = dates.dt.isocalendar().week
        df['week'] = iso_week.astype(
            'float64' if iso_week.isna().any() else 'int64'
        )
        df['day'] = dates.dt.day
        df['dayofweek'] = dates.dt.dayofweek
        df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
        df['quarter'] = dates.dt.quarter
        df['dayofyear'] = dates.dt.dayofyear

        # Lag features
        for lag in self.config.get('lags', [1, 7, 30]):
            df[f'lag_{lag}'] = df[value_col].shift(lag)

        # Rolling statistics
        for window in self.config.get('rolling_windows', [7, 30]):
            rolling = df[value_col].rolling(window=window)
            df[f'rolling_mean_{window}'] = rolling.mean()
            df[f'rolling_std_{window}'] = rolling.std()

        # Difference features
        for diff in self.config.get('differences', [1, 7]):
            df[f'diff_{diff}'] = df[value_col].diff(diff)

        # Seasonal features
        year_angle = 2 * np.pi * df['dayofyear'] / 365
        df['seasonal_sin'] = np.sin(year_angle)
        df['seasonal_cos'] = np.cos(year_angle)

        # Handle missing values. shift/rolling/diff leave NaN in the leading
        # rows; back-fill copies the first computed value into them, and
        # forward-fill covers any remaining gaps. ``fillna(method=...)`` is
        # deprecated in pandas, so the dedicated methods are used.
        df = df.bfill().ffill()

        self.feature_columns = [
            col for col in df.columns if col not in (date_col, value_col)
        ]

        return df

    def create_sequences(self, df: pd.DataFrame,
                         target_col: str,
                         feature_cols: list,
                         seq_length: int = 30,
                         forecast_horizon: int = 7) -> tuple:
        """Create sequences for deep learning models.

        Sample ``i`` pairs the feature rows ``i .. i+seq_length-1`` with the
        target values of the following ``forecast_horizon`` rows.

        Args:
            df: Frame containing ``feature_cols`` and ``target_col``.
            target_col: Column to forecast.
            feature_cols: Feature column names; any iterable of names
                (list, tuple, ``pd.Index``) is accepted.
            seq_length: Number of consecutive rows in each input window.
            forecast_horizon: Number of future target values per sample.

        Returns:
            ``(X, y)`` where ``X`` has shape
            ``(n_samples, seq_length, n_features)`` and ``y`` has shape
            ``(n_samples, forecast_horizon)``. When ``df`` is too short for
            a single sample, both arrays are empty but keep those trailing
            dimensions.

        Raises:
            ValueError: If ``seq_length`` or ``forecast_horizon`` is < 1.
        """
        if seq_length < 1 or forecast_horizon < 1:
            raise ValueError(
                "seq_length and forecast_horizon must be positive integers"
            )

        # list(...) so tuples work and a pd.Index is not concatenated
        # element-wise with the target name.
        columns = list(feature_cols) + [target_col]
        data = df[columns].to_numpy()
        n_features = data.shape[1] - 1
        n_samples = len(data) - seq_length - forecast_horizon + 1

        if n_samples <= 0:
            # np.array([]) would collapse to shape (0,); keep the window
            # dimensions so downstream shape handling still works.
            return (
                np.empty((0, seq_length, n_features), dtype=data.dtype),
                np.empty((0, forecast_horizon), dtype=data.dtype),
            )

        X, y = [], []
        for start in range(n_samples):
            split = start + seq_length
            X.append(data[start:split, :-1])  # Features
            y.append(data[split:split + forecast_horizon, -1])  # Target

        return np.array(X), np.array(y)