File size: 1,125 Bytes
b89e6d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""Phase 1 EDA entrypoint: analyse the processed training split.

Usage:
    python scripts/02_run_eda.py
Reads data/processed/train.parquet, writes plots + eda_stats.json to data/eda/.
"""
from __future__ import annotations

import json
import sys
from pathlib import Path

import pandas as pd

# Make the project-local ``src`` package importable when this file is run
# directly as a script: append the parent of this file's directory
# (``parents[1]`` of the resolved path) to the module search path.
# NOTE(review): presumably that directory is the repository root — confirm.
# This must run before the ``src`` imports below, hence the non-top imports.
sys.path.append(str(Path(__file__).resolve().parents[1]))
from src.config import load_config
from src.eda.analyze import run_eda


def main():
    """Run the Phase 1 EDA over the processed training split.

    Loads the project config, reads ``train.parquet`` from the configured
    processed directory, optionally reads ``cleaning_funnel.csv`` from the
    same directory, and passes both to ``run_eda``. The returned stats are
    printed as indented JSON with the ``"plots"`` entry left out, followed
    by the configured EDA output directory.

    Exits via ``sys.exit`` with an explanatory message when
    ``train.parquet`` does not exist.
    """
    config = load_config()

    banner = "=" * 60
    print(banner)
    print("PHASE 1: EXPLORATORY DATA ANALYSIS")
    print(banner)

    processed_dir = Path(config.paths.processed_dir)
    train_file = processed_dir / "train.parquet"
    funnel_file = processed_dir / "cleaning_funnel.csv"

    # Guard clause: nothing to analyse without the training split.
    if not train_file.exists():
        sys.exit("train.parquet not found. Run scripts/01_prepare_data.py first.")

    train_df = pd.read_parquet(train_file)

    # The cleaning funnel is optional; run_eda receives None when it is absent.
    if funnel_file.exists():
        funnel_df = pd.read_csv(funnel_file)
    else:
        funnel_df = None

    eda_stats = run_eda(train_df, config, funnel_df)

    # Everything except the "plots" entry is echoed to stdout.
    printable = {
        key: value for key, value in eda_stats.items() if key != "plots"
    }
    print(json.dumps(printable, indent=2))
    print(f"\nPlots saved to {config.paths.eda_dir}")

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()