# Navya-Sree's picture
# Create src/data_processing/processor.py
# e03ff34 verified
"""
Simplified data processor for Hugging Face compatibility.
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class AdvancedDataProcessor:
    """Feature engineering and sequence windowing for time series forecasting.

    Attributes:
        config: Options dict. ``engineer_features`` reads the optional keys
            ``'lags'``, ``'rolling_windows'`` and ``'differences'``.
        scalers: Empty dict created at construction; not written by the
            methods defined here.
        feature_columns: Names of the engineered feature columns produced by
            the most recent ``engineer_features`` call.
    """

    def __init__(self, config: dict):
        """Store the configuration and initialise empty state.

        Args:
            config: Options dict. Missing keys fall back to the defaults
                used in ``engineer_features``.
        """
        self.config = config
        self.scalers = {}
        self.feature_columns = []

    def engineer_features(self, df: pd.DataFrame,
                          date_col: str,
                          value_col: str) -> pd.DataFrame:
        """Create calendar, lag, rolling, difference and seasonal features.

        The input frame is copied, never modified. As a side effect,
        ``self.feature_columns`` is set to every output column except
        ``date_col`` and ``value_col``.

        Args:
            df: Input frame, expected to be ordered by time (lags, rolling
                windows and differences are positional).
            date_col: Name of the date column. If it is not already a
                datetime dtype it is parsed with ``pd.to_datetime`` in the
                returned copy.
            value_col: Name of the column the lag/rolling/diff features are
                derived from.

        Returns:
            A new DataFrame with the original columns plus the features.
        """
        df = df.copy()

        # The .dt accessor only exists on datetime-like columns; parse
        # strings/objects up front instead of failing with AttributeError.
        if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
            df[date_col] = pd.to_datetime(df[date_col])
        dates = df[date_col].dt

        # DateTime features
        df['year'] = dates.year
        df['month'] = dates.month
        df['week'] = dates.isocalendar().week
        df['day'] = dates.day
        df['dayofweek'] = dates.dayofweek
        df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
        df['quarter'] = dates.quarter
        df['dayofyear'] = dates.dayofyear

        values = df[value_col]

        # Lag features
        for lag in self.config.get('lags', [1, 7, 30]):
            df[f'lag_{lag}'] = values.shift(lag)

        # Rolling statistics
        for window in self.config.get('rolling_windows', [7, 30]):
            rolling = values.rolling(window=window)
            df[f'rolling_mean_{window}'] = rolling.mean()
            df[f'rolling_std_{window}'] = rolling.std()

        # Difference features
        for diff in self.config.get('differences', [1, 7]):
            df[f'diff_{diff}'] = values.diff(diff)

        # Seasonal features: annual cycle with a fixed 365-day period.
        angle = 2 * np.pi * df['dayofyear'] / 365
        df['seasonal_sin'] = np.sin(angle)
        df['seasonal_cos'] = np.cos(angle)

        # Handle missing values. bfill()/ffill() replace the deprecated
        # fillna(method=...) spelling with identical results.
        # NOTE(review): back-filling copies later observations into the
        # leading rows of the lag/rolling/diff columns -- confirm that this
        # look-ahead is acceptable for model training.
        df = df.bfill().ffill()

        # isocalendar() yields the nullable UInt32 dtype; use plain int64 so
        # the column combines cleanly with the NumPy-backed columns when the
        # frame is turned into an array. Skipped if missing values remain.
        if df['week'].notna().all():
            df['week'] = df['week'].astype('int64')

        self.feature_columns = [
            col for col in df.columns if col not in [date_col, value_col]
        ]
        return df

    def create_sequences(self, df: pd.DataFrame,
                         target_col: str,
                         feature_cols: list,
                         seq_length: int = 30,
                         forecast_horizon: int = 7) -> tuple:
        """Create sliding-window sequences for deep learning models.

        Window ``i`` uses rows ``i .. i + seq_length - 1`` of the feature
        columns as input and the following ``forecast_horizon`` rows of
        ``target_col`` as output.

        Args:
            df: Frame containing ``feature_cols`` and ``target_col``.
            target_col: Name of the column to forecast.
            feature_cols: Names of the input feature columns (any iterable
                of column names).
            seq_length: Number of rows in each input window.
            forecast_horizon: Number of target rows in each output window.

        Returns:
            ``(X, y)`` where ``X`` has shape
            ``(n_windows, seq_length, n_features)`` and ``y`` has shape
            ``(n_windows, forecast_horizon)``. When ``df`` has too few rows
            for a single window, both arrays are empty but keep these
            trailing dimensions.
        """
        feature_cols = list(feature_cols)
        data = df[feature_cols + [target_col]].values
        n_windows = len(data) - seq_length - forecast_horizon + 1

        if n_windows <= 0:
            # np.array([]) would collapse to shape (0,); keep the trailing
            # dimensions so callers can rely on the array rank.
            empty_X = np.empty(
                (0, max(seq_length, 0), len(feature_cols)), dtype=data.dtype
            )
            empty_y = np.empty((0, max(forecast_horizon, 0)), dtype=data.dtype)
            return empty_X, empty_y

        # Last column of `data` is the target; all others are features.
        X = [data[i:i + seq_length, :-1] for i in range(n_windows)]
        y = [
            data[i + seq_length:i + seq_length + forecast_horizon, -1]
            for i in range(n_windows)
        ]
        return np.array(X), np.array(y)