# code-gen-assistant/scripts/02_run_eda.py
# Rushabh147 - commit b89e6d6: "Initial deploy to HF Spaces (clean history, LFS for all binaries)"
# File size: 1.13 kB
"""Phase 1 EDA entrypoint: analyse the processed training split.
Usage:
python scripts/02_run_eda.py
Reads data/processed/train.parquet, writes plots + eda_stats.json to data/eda/.
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
import pandas as pd
sys.path.append(str(Path(__file__).resolve().parents[1]))
from src.config import load_config
from src.eda.analyze import run_eda
def main():
    """Run the Phase 1 EDA on the processed training split and print a summary.

    Loads the project config, reads ``train.parquet`` from
    ``cfg.paths.processed_dir``, optionally reads ``cleaning_funnel.csv`` from
    the same directory, and hands both to ``run_eda``. The mapping returned by
    ``run_eda`` is printed as indented JSON with its ``"plots"`` entry left out,
    followed by the configured EDA output directory.

    Raises:
        SystemExit: if ``train.parquet`` does not exist in the processed
            directory (the exit message points at the data-preparation script).
    """
    config = load_config()

    rule = "=" * 60
    print(rule, "PHASE 1: EXPLORATORY DATA ANALYSIS", rule, sep="\n")

    processed_dir = Path(config.paths.processed_dir)
    train_file = processed_dir / "train.parquet"
    funnel_file = processed_dir / "cleaning_funnel.csv"

    # The training split is mandatory; bail out before doing any work without it.
    if not train_file.exists():
        sys.exit("train.parquet not found. Run scripts/01_prepare_data.py first.")

    train_df = pd.read_parquet(train_file)

    # The cleaning funnel is optional: run_eda receives None when it is absent.
    funnel_df = None
    if funnel_file.exists():
        funnel_df = pd.read_csv(funnel_file)

    eda_stats = run_eda(train_df, config, funnel_df)

    # Everything except the "plots" entry is echoed to stdout.
    printable = {}
    for key, value in eda_stats.items():
        if key == "plots":
            continue
        printable[key] = value

    print(json.dumps(printable, indent=2))
    print(f"\nPlots saved to {config.paths.eda_dir}")
# Run the EDA only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()