"""Fetch the KCC (Kisan Call Centre) dataset from Kaggle.

Requires Kaggle API credentials at ~/.kaggle/kaggle.json.

Setup:
    pip install kaggle
    mkdir -p ~/.kaggle
    cp kaggle.json ~/.kaggle/     # from your Kaggle account settings
    chmod 600 ~/.kaggle/kaggle.json

Usage:
    python scripts/fetch_kcc.py
"""

from __future__ import annotations

import os
import shutil
from pathlib import Path

# Project root is two levels above this script; raw downloads live in data/raw.
ROOT = Path(__file__).resolve().parent.parent
RAW_DIR = ROOT / "data" / "raw"
DATASET = "sridhargutam/kcc-dataset"


def _describe_files(directory: Path) -> list[str]:
    """Return one ``"<relative path> (<size> MB)"`` entry per file under *directory*.

    The tree is walked recursively and only regular files are reported, so
    files inside sub-folders are included and directories themselves are not
    listed with a meaningless size. Entries are sorted by path; paths are
    relative to *directory* and use forward slashes.
    """
    files = sorted(p for p in directory.rglob("*") if p.is_file())
    return [
        f"{p.relative_to(directory).as_posix()} "
        f"({p.stat().st_size / 1024 / 1024:.1f} MB)"
        for p in files
    ]


def main() -> None:
    """Download and unzip the KCC dataset into ``data/raw``, then list the files.

    Creates ``RAW_DIR`` if it does not exist, authenticates with the Kaggle
    API, downloads ``DATASET`` with ``unzip=True`` and prints every file found
    under ``RAW_DIR`` together with its size in MB.
    """
    # Import lazily — kaggle is a heavy import that touches credentials.
    from kaggle import KaggleApi

    RAW_DIR.mkdir(parents=True, exist_ok=True)

    api = KaggleApi()
    api.authenticate()

    print(f"Downloading {DATASET} -> {RAW_DIR} (this can take a few minutes)…")
    api.dataset_download_files(DATASET, path=str(RAW_DIR), unzip=True, quiet=False)

    # The dataset is described as publishing a CSV per state plus a combined
    # file. Nothing is renamed or moved here; the listing only shows what
    # arrived (including files unpacked into sub-folders) so downstream
    # scripts can be pointed at the right paths.
    print("Files in data/raw after download:")
    listing = _describe_files(RAW_DIR)
    if not listing:
        print(" (no files found)")
    for entry in listing:
        print(f" {entry}")


if __name__ == "__main__":
    main()