Spaces:
Sleeping
Sleeping
| """Phase 1 entrypoint: load -> clean -> split -> save processed parquet files. | |
| Usage: | |
| python scripts/01_prepare_data.py | |
| Outputs train/val/test parquet files + a cleaning_funnel.csv into data/processed/. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
| sys.path.append(str(Path(__file__).resolve().parents[1])) | |
| from src.config import load_config | |
| from src.data.clean import clean, split | |
| from src.data.load import load_raw | |
def main() -> None:
    """Run the Phase 1 data-preparation pipeline end to end.

    Steps, in order:

    1. Load the project configuration via ``load_config()``.
    2. Load the raw data with ``load_raw(cfg)``.
    3. Clean it with ``clean(raw, cfg)``, which returns the cleaned data and
       a "funnel" table that is printed to stdout.
    4. Split the cleaned data with ``split(cleaned, cfg)``; the result is
       iterated as a mapping of split name -> data part.
    5. Write each part to ``<processed_dir>/<name>.parquet`` and the funnel
       to ``<processed_dir>/cleaning_funnel.csv``.

    The output directory is taken from ``cfg.paths.processed_dir`` and is
    created (including parents) if it does not exist yet.
    """
    cfg = load_config()

    banner = "=" * 60
    print(banner)
    print("PHASE 1: DATA PREPARATION")
    print(banner)

    raw = load_raw(cfg)
    cleaned, funnel = clean(raw, cfg)

    print("\nCleaning funnel:")
    print(funnel.to_string(index=False))

    splits = split(cleaned, cfg)

    out = Path(cfg.paths.processed_dir)
    # Make sure the target directory exists before any file is written; on a
    # fresh checkout it may be missing, and the writers below are not
    # expected to create parent directories themselves.
    out.mkdir(parents=True, exist_ok=True)

    for name, part in splits.items():
        path = out / f"{name}.parquet"
        part.to_parquet(path, index=False)
        print(f" saved {name}: {len(part):,} rows -> {path}")

    funnel.to_csv(out / "cleaning_funnel.csv", index=False)
    print(f"\nDone. Processed data in {out}")
| if __name__ == "__main__": | |
| main() | |