Spaces:
Sleeping
Sleeping
File size: 1,125 Bytes
"""Phase 1 EDA entrypoint: analyse the processed training split.

Usage:
    python scripts/02_run_eda.py

Reads data/processed/train.parquet, writes plots + eda_stats.json to data/eda/.
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
import pandas as pd
sys.path.append(str(Path(__file__).resolve().parents[1]))
from src.config import load_config
from src.eda.analyze import run_eda
def main():
    """Run the Phase 1 EDA over the processed training split.

    Loads the project config, prints a banner, and reads ``train.parquet``
    from the configured processed directory. ``cleaning_funnel.csv`` from the
    same directory is loaded when it exists; otherwise ``None`` is passed in
    its place. The stats returned by ``run_eda`` are printed as indented JSON
    with the ``"plots"`` entry left out, followed by a line naming the
    configured EDA directory.

    Exits via ``sys.exit`` with a message when ``train.parquet`` is missing.
    """
    config = load_config()

    rule = "=" * 60
    print(rule, "PHASE 1: EXPLORATORY DATA ANALYSIS", rule, sep="\n")

    processed_dir = Path(config.paths.processed_dir)
    train_file = processed_dir / "train.parquet"
    funnel_file = processed_dir / "cleaning_funnel.csv"

    # Nothing to analyse without the training split: stop with a hint.
    if not train_file.exists():
        sys.exit("train.parquet not found. Run scripts/01_prepare_data.py first.")

    train_df = pd.read_parquet(train_file)

    # The cleaning funnel is optional input for the analysis.
    if funnel_file.exists():
        funnel_df = pd.read_csv(funnel_file)
    else:
        funnel_df = None

    eda_stats = run_eda(train_df, config, funnel_df)

    # The "plots" entry is excluded from the console summary.
    printable = {key: value for key, value in eda_stats.items() if key != "plots"}
    print(json.dumps(printable, indent=2))
    print(f"\nPlots saved to {config.paths.eda_dir}")
# Run the EDA phase only when this file is executed as a script, so that
# importing the module does not trigger the analysis.
if __name__ == "__main__":
    main()
|