"""Phase 1 entrypoint: load -> clean -> split -> save processed parquet files. Usage: python scripts/01_prepare_data.py Outputs train/val/test parquet files + a cleaning_funnel.csv into data/processed/. """ from __future__ import annotations import sys from pathlib import Path sys.path.append(str(Path(__file__).resolve().parents[1])) from src.config import load_config from src.data.clean import clean, split from src.data.load import load_raw def main(): cfg = load_config() print("=" * 60) print("PHASE 1: DATA PREPARATION") print("=" * 60) raw = load_raw(cfg) cleaned, funnel = clean(raw, cfg) print("\nCleaning funnel:") print(funnel.to_string(index=False)) splits = split(cleaned, cfg) out = Path(cfg.paths.processed_dir) for name, part in splits.items(): path = out / f"{name}.parquet" part.to_parquet(path, index=False) print(f" saved {name}: {len(part):,} rows -> {path}") funnel.to_csv(out / "cleaning_funnel.csv", index=False) print(f"\nDone. Processed data in {out}") if __name__ == "__main__": main()