|
|
"""
Exploratory Data Analysis (EDA) script for the predictive maintenance project.

This script covers:
- Data overview (shape, types, missing values, basic statistics)
- Univariate analysis (distributions of features, target balance)
- Bivariate/multivariate analysis (correlations and pairwise relationships)

Figures are saved under `notebooks/figures/` for easy inclusion in reports.
"""
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
from pathlib import Path |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
|
|
|
import config |
|
|
from data_prep import _clean_data, _load_raw_data_from_hf_or_local |
|
|
|
|
|
|
|
|
# Directory that receives every figure written by this script. It is created
# at import time (including missing parents) so it exists before any save.
FIGURES_DIR = config.PROJECT_ROOT.joinpath("notebooks", "figures")
FIGURES_DIR.mkdir(exist_ok=True, parents=True)
|
|
|
|
|
|
|
|
def _save_and_close(fig, filename: str) -> None:
    """Save *fig* as ``FIGURES_DIR / filename`` and close it."""
    # A tight bounding box keeps text that is drawn outside the default
    # canvas (e.g. a suptitle placed at y=1.02) inside the saved image.
    fig.savefig(FIGURES_DIR / filename, bbox_inches="tight")
    plt.close(fig)


def run_eda() -> None:
    """Load and clean the dataset, print an overview, and save EDA figures.

    The function prints the shape, dtypes, per-column missing-value counts and
    ``describe()`` statistics of the cleaned frame, then writes four PNG files
    into ``FIGURES_DIR``:

    - ``target_distribution.png``: count plot of ``config.TARGET_COLUMN``
    - ``feature_histograms.png``: histograms of ``config.FEATURE_COLUMNS``
    - ``correlation_heatmap.png``: correlations of features plus target
    - ``pairplot_subset.png``: pair plot of a fixed subset of columns
    """
    raw_df = _load_raw_data_from_hf_or_local()
    df = _clean_data(raw_df)

    # Normalise to a list so that list or tuple configuration both work for
    # column selection and concatenation below.
    feature_cols = list(config.FEATURE_COLUMNS)
    target_col = config.TARGET_COLUMN

    # --- Data overview -----------------------------------------------------
    print("=== DATA OVERVIEW ===")
    print(f"Shape: {df.shape}")
    print("\nData types:")
    print(df.dtypes)
    print("\nMissing values per column:")
    print(df.isna().sum())
    print("\nSummary statistics:")
    print(df.describe())

    # --- Univariate: target balance ----------------------------------------
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.countplot(x=target_col, data=df, ax=ax)
    ax.set_title("Engine Condition Distribution")
    ax.set_xlabel("Engine Condition (0 = Normal, 1 = Faulty)")
    ax.set_ylabel("Count")
    fig.tight_layout()
    _save_and_close(fig, "target_distribution.png")

    # --- Univariate: feature distributions ---------------------------------
    # DataFrame.hist builds its own figure; pick it up as the current figure.
    df[feature_cols].hist(bins=30, figsize=(12, 8))
    fig = plt.gcf()
    fig.suptitle("Feature Distributions", y=1.02)
    fig.tight_layout()
    _save_and_close(fig, "feature_histograms.png")

    # --- Multivariate: correlation heatmap ---------------------------------
    fig, ax = plt.subplots(figsize=(8, 6))
    corr = df[[*feature_cols, target_col]].corr()
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    ax.set_title("Correlation Heatmap")
    fig.tight_layout()
    _save_and_close(fig, "correlation_heatmap.png")

    # --- Multivariate: pairwise relationships ------------------------------
    # Only a fixed subset of columns is plotted.
    subset_cols = ["Engine_RPM", "Lub_Oil_Pressure", "Fuel_Pressure", target_col]
    sns.pairplot(
        df[subset_cols],
        hue=target_col,
        diag_kind="hist",
        corner=True,
    )
    # pairplot also creates its own figure, which becomes the current one.
    fig = plt.gcf()
    fig.suptitle("Pairwise Relationships (subset of features)", y=1.02)
    fig.tight_layout()
    _save_and_close(fig, "pairplot_subset.png")

    print(f"\nEDA figures saved to: {FIGURES_DIR}")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the analysis only when executed as a script, not when imported.
    run_eda()
|
|
|
|
|
|