# Data-Science-Agent/src/tools/enhanced_feature_engineering.py
# Pulastya B
# fix: Fix module import paths for Render deployment
# 227cb22
"""
Enhanced Feature Engineering - Additional robust features
"""
import polars as pl
import numpy as np
from typing import Dict, Any, List, Optional
from pathlib import Path
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from ..utils.polars_helpers import load_dataframe, save_dataframe, get_numeric_columns
from ..utils.validation import validate_file_exists, validate_dataframe
def create_ratio_features(file_path: str,
                          columns: Optional[List[str]] = None,
                          max_ratios: int = 20,
                          output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Add pairwise ratio features (numerator / denominator) to a dataset.

    Only the first 16 entries of ``columns`` take part. Pairs are visited in
    list order (each column divided by every later column) and generation
    stops as soon as ``max_ratios`` features have been built.

    Every ratio is guarded: a denominator whose absolute value is below 1e-10
    yields 0, the result is clipped to [-1e6, 1e6], and NaN / null results are
    replaced by 0.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = all numeric)
        max_ratios: Maximum number of ratio features
        output_path: Output file path (nothing is written when falsy)

    Returns:
        Dictionary with ``success``, ``tool`` and a ``result`` holding the
        number of new features, their names and the output path.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"πŸ”’ Creating ratio features from {len(columns)} columns...")

    # Every (numerator, denominator) candidate, in the order they are tried:
    # numerators come from the first 15 columns, denominators from the later
    # columns up to (and including) the 16th.
    candidate_pairs = [
        (numerator, denominator)
        for position, numerator in enumerate(columns[:15])
        for denominator in columns[position + 1:16]
    ]

    ratio_exprs = []
    feature_names = []
    for numerator, denominator in candidate_pairs:
        if len(ratio_exprs) >= max_ratios:
            break

        ratio_name = f"ratio_{numerator}_div_{denominator}"
        denominator_expr = pl.col(denominator)

        # Near-zero denominators short-circuit to 0 instead of dividing.
        guarded = (
            pl.when(denominator_expr.abs() < 1e-10)
            .then(0)
            .otherwise(pl.col(numerator) / denominator_expr)
        )
        # Bound extreme values, then neutralise NaN and null results.
        cleaned = guarded.clip(-1e6, 1e6).fill_nan(0).fill_null(0)

        ratio_exprs.append(cleaned.alias(ratio_name))
        feature_names.append(ratio_name)

    df = df.with_columns(ratio_exprs)

    if output_path:
        save_dataframe(df, output_path)

    result = {
        'new_features': len(ratio_exprs),
        'feature_names': feature_names,
        'output_path': output_path
    }
    return {
        'success': True,
        'tool': 'create_ratio_features',
        'result': result
    }
def create_statistical_features(file_path: str,
                                columns: Optional[List[str]] = None,
                                output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create row-wise statistical features (mean, std, min, max, range, sum).

    The selected columns are gathered into one list per row and the
    statistics are computed over that list. Any statistic that comes back
    null is replaced by 0.

    When there are no columns to aggregate (``columns`` is empty, or the
    dataset has no numeric columns), no features are added and the result
    reports zero new features instead of failing inside polars.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = all numeric)
        output_path: Output file path (nothing is written when falsy)

    Returns:
        Dictionary with ``success``, ``tool`` and a ``result`` holding the
        number of new features, their names and the output path.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"πŸ“Š Creating statistical features across {len(columns)} columns...")

    feature_names: List[str] = []

    # pl.concat_list needs at least one expression, so skip feature creation
    # entirely when there is nothing to aggregate.
    if columns:
        # Build the per-row list once and reuse it for every statistic.
        row_values = pl.concat_list([pl.col(c) for c in columns])
        row_min = row_values.list.min()
        row_max = row_values.list.max()

        # Insertion order defines the order of the new columns.
        row_stats = {
            'row_mean': row_values.list.mean(),
            'row_std': row_values.list.std(),
            'row_min': row_min,
            'row_max': row_max,
            'row_range': row_max - row_min,
            'row_sum': row_values.list.sum(),
        }

        df = df.with_columns(
            [expr.fill_null(0).alias(name) for name, expr in row_stats.items()]
        )
        feature_names = list(row_stats)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_statistical_features',
        'result': {
            'new_features': len(feature_names),
            'feature_names': feature_names,
            'output_path': output_path
        }
    }
def create_log_features(file_path: str,
                        columns: Optional[List[str]] = None,
                        output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Add log-transformed copies of non-negative columns.

    The transform is chosen per column from its minimum value:
      * minimum > 0  -> ``log(x)``, stored as ``log_<column>``
      * minimum == 0 -> ``log(x + 1)``, stored as ``log1p_<column>``
    Columns whose minimum is missing (None) or negative are skipped.
    NaN results are replaced by 0.

    Args:
        file_path: Path to dataset
        columns: Columns to consider (None = all numeric)
        output_path: Output file path (nothing is written when falsy)

    Returns:
        Dictionary with ``success``, ``tool`` and a ``result`` holding the
        number of new features, their names and the output path.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"πŸ“ˆ Creating log-transformed features for {len(columns)} columns...")

    log_exprs = []
    feature_names = []
    for column_name in columns:
        smallest = df[column_name].min()
        if smallest is None:
            continue

        if smallest > 0:
            # Strictly positive data: plain log(x).
            source = pl.col(column_name)
            feature = f"log_{column_name}"
        elif smallest >= 0:
            # Zeros present: shift by one before taking the log.
            source = pl.col(column_name) + 1
            feature = f"log1p_{column_name}"
        else:
            # Negative values: no log feature for this column.
            continue

        log_exprs.append(source.log().fill_nan(0).alias(feature))
        feature_names.append(feature)

    if log_exprs:
        df = df.with_columns(log_exprs)

    if output_path:
        save_dataframe(df, output_path)

    result = {
        'new_features': len(log_exprs),
        'feature_names': feature_names,
        'output_path': output_path
    }
    return {
        'success': True,
        'tool': 'create_log_features',
        'result': result
    }
def create_binned_features(file_path: str,
                           columns: Optional[List[str]] = None,
                           n_bins: int = 5,
                           output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Add quantile-binned (discretized) versions of continuous columns.

    Each selected column is passed to ``qcut`` with ``n_bins`` quantiles and
    the labels ``bin_0`` .. ``bin_{n_bins - 1}``; ``allow_duplicates=True`` is
    forwarded to ``qcut``. Null bin assignments are filled with ``"bin_0"``.
    The new column is named ``<column>_binned``.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = the first 10 numeric columns)
        n_bins: Number of bins
        output_path: Output file path (nothing is written when falsy)

    Returns:
        Dictionary with ``success``, ``tool`` and a ``result`` holding the
        number of new features, their names, ``n_bins`` and the output path.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        # Default selection is capped at 10 columns.
        columns = get_numeric_columns(df)[:10]

    print(f"πŸ—‚οΈ Creating binned features for {len(columns)} columns with {n_bins} bins...")

    feature_names = [f"{source}_binned" for source in columns]
    binned_exprs = [
        pl.col(source)
        .qcut(
            n_bins,
            labels=[f"bin_{idx}" for idx in range(n_bins)],
            allow_duplicates=True,
        )
        .fill_null("bin_0")
        .alias(target)
        for source, target in zip(columns, feature_names)
    ]

    df = df.with_columns(binned_exprs)

    if output_path:
        save_dataframe(df, output_path)

    result = {
        'new_features': len(binned_exprs),
        'feature_names': feature_names,
        'n_bins': n_bins,
        'output_path': output_path
    }
    return {
        'success': True,
        'tool': 'create_binned_features',
        'result': result
    }