File size: 2,613 Bytes
1aa7fae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Exploratory Data Analysis (EDA) script for the predictive maintenance project.

This script covers:
- Data overview (shape, types, missing values, basic statistics)
- Univariate analysis (distributions of features, target balance)
- Bivariate/multivariate analysis (correlations and pairwise relationships)

Figures are saved under `notebooks/figures/` for easy inclusion in reports.
"""

from __future__ import annotations

from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

import config
from data_prep import _clean_data, _load_raw_data_from_hf_or_local


# Directory that receives every figure written by this script. It is created
# at import time so the savefig calls below never hit a missing folder.
FIGURES_DIR = config.PROJECT_ROOT.joinpath("notebooks", "figures")
FIGURES_DIR.mkdir(parents=True, exist_ok=True)


def _save_and_close(fig, filename: str) -> None:
    """Write ``fig`` to ``FIGURES_DIR / filename`` and close it.

    ``bbox_inches="tight"`` makes matplotlib size the saved canvas around
    everything that was drawn. Two of the plots below place their suptitle
    at ``y=1.02`` (just above the figure's top edge); a plain ``savefig``
    would crop that title out of the PNG.
    """
    fig.savefig(FIGURES_DIR / filename, bbox_inches="tight")
    # Close this specific figure rather than "the current one" so the helper
    # does not depend on pyplot's global state.
    plt.close(fig)


def run_eda() -> None:
    """Print an overview of the cleaned dataset and save the EDA figures.

    The data is loaded and cleaned with the ``data_prep`` helpers, then:

    * shape, dtypes, per-column missing counts and ``describe()`` output are
      printed to stdout;
    * four PNG files are written to ``FIGURES_DIR``:
      ``target_distribution.png``, ``feature_histograms.png``,
      ``correlation_heatmap.png`` and ``pairplot_subset.png``.

    Returns:
        None. The function is used for its printed output and saved files.
    """
    # Load and clean data using the same logic as the pipeline
    raw_df = _load_raw_data_from_hf_or_local()
    df = _clean_data(raw_df)

    # Normalise to a list so column selection works whether the config stores
    # the feature names as a list or as a tuple (a tuple passed to ``df[...]``
    # would be treated as a single key, and ``tuple + list`` raises).
    feature_cols = list(config.FEATURE_COLUMNS)
    target_col = config.TARGET_COLUMN

    print("=== DATA OVERVIEW ===")
    print(f"Shape: {df.shape}")
    print("\nData types:")
    print(df.dtypes)
    print("\nMissing values per column:")
    print(df.isna().sum())
    print("\nSummary statistics:")
    print(df.describe())

    # Univariate analysis: target distribution
    fig = plt.figure(figsize=(4, 4))
    sns.countplot(x=target_col, data=df)
    plt.title("Engine Condition Distribution")
    plt.xlabel("Engine Condition (0 = Normal, 1 = Faulty)")
    plt.ylabel("Count")
    fig.tight_layout()
    _save_and_close(fig, "target_distribution.png")

    # Univariate analysis: histograms for features.
    # ``DataFrame.hist`` creates its own figure, which becomes the current one.
    df[feature_cols].hist(bins=30, figsize=(12, 8))
    fig = plt.gcf()
    fig.suptitle("Feature Distributions", y=1.02)
    fig.tight_layout()
    _save_and_close(fig, "feature_histograms.png")

    # Correlation heatmap (multivariate)
    fig = plt.figure(figsize=(8, 6))
    corr = df[feature_cols + [target_col]].corr()
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap")
    fig.tight_layout()
    _save_and_close(fig, "correlation_heatmap.png")

    # Pairplot for a subset of features (bivariate relationships)
    subset_cols = ["Engine_RPM", "Lub_Oil_Pressure", "Fuel_Pressure", target_col]
    grid = sns.pairplot(
        df[subset_cols],
        hue=target_col,
        diag_kind="hist",
        corner=True,
    )
    grid.figure.suptitle("Pairwise Relationships (subset of features)", y=1.02)
    # Use the grid's own tight_layout: it keeps the margin seaborn reserved
    # for the hue legend, whereas plt.tight_layout() would let the axes
    # expand underneath the legend.
    grid.tight_layout()
    _save_and_close(grid.figure, "pairplot_subset.png")

    print(f"\nEDA figures saved to: {FIGURES_DIR}")


# Run the full EDA only when this file is executed as a script, not when it
# is imported by another module.
if __name__ == "__main__":
    run_eda()