Spaces:
Sleeping
Sleeping
| """Phase 1 EDA entrypoint: analyse the processed training split. | |
| Usage: | |
| python scripts/02_run_eda.py | |
| Reads data/processed/train.parquet, writes plots + eda_stats.json to data/eda/. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import sys | |
| from pathlib import Path | |
| import pandas as pd | |
| sys.path.append(str(Path(__file__).resolve().parents[1])) | |
| from src.config import load_config | |
| from src.eda.analyze import run_eda | |
def main():
    """Run the Phase 1 EDA on the processed training split and print a summary.

    Loads the project configuration, reads ``train.parquet`` from the
    configured processed-data directory and, when present, the optional
    ``cleaning_funnel.csv`` next to it. Both are handed to ``run_eda``
    together with the configuration. Every entry of the returned stats
    mapping except ``"plots"`` is printed as indented JSON, followed by the
    configured EDA output directory.

    Exits via ``sys.exit`` with an explanatory message when
    ``train.parquet`` does not exist.
    """
    config = load_config()

    rule = "=" * 60
    print(rule)
    print("PHASE 1: EXPLORATORY DATA ANALYSIS")
    print(rule)

    processed_dir = Path(config.paths.processed_dir)
    train_file = processed_dir / "train.parquet"
    funnel_file = processed_dir / "cleaning_funnel.csv"

    # The training split is mandatory; without it there is nothing to analyse.
    if not train_file.exists():
        sys.exit("train.parquet not found. Run scripts/01_prepare_data.py first.")

    train_df = pd.read_parquet(train_file)

    # The cleaning funnel is optional; run_eda receives None when it is absent.
    if funnel_file.exists():
        funnel_df = pd.read_csv(funnel_file)
    else:
        funnel_df = None

    eda_stats = run_eda(train_df, config, funnel_df)

    # The "plots" entry is left out of the console summary.
    printable = {key: value for key, value in eda_stats.items() if key != "plots"}
    print(json.dumps(printable, indent=2))
    print(f"\nPlots saved to {config.paths.eda_dir}")
# Run the EDA only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()