"""Fetch the KCC (Kisan Call Centre) dataset from Kaggle.

Requires Kaggle API credentials at ~/.kaggle/kaggle.json. Setup:
    pip install kaggle
    mkdir -p ~/.kaggle
    cp kaggle.json ~/.kaggle/  # from your Kaggle account settings
    chmod 600 ~/.kaggle/kaggle.json

Usage:
    python scripts/fetch_kcc.py
"""
from __future__ import annotations

import os
import shutil
from pathlib import Path

# Repository root: two levels up from this file, which lives in scripts/.
ROOT = Path(__file__).resolve().parent.parent
# Directory that receives the downloaded (and unzipped) dataset files.
RAW_DIR = ROOT.joinpath("data", "raw")
# Kaggle dataset identifier passed to the download call.
DATASET = "sridhargutam/kcc-dataset"


def main() -> None:
    """Download the KCC dataset into ``data/raw`` and print what was fetched.

    Creates ``RAW_DIR`` if needed, authenticates against Kaggle, downloads and
    unzips ``DATASET`` into ``RAW_DIR``, then prints every regular file found
    beneath ``RAW_DIR`` together with its size in MB.

    No error handling is done here: anything raised by the Kaggle client
    (for example on missing credentials or a failed download) or by the
    filesystem calls propagates to the caller.
    """
    # Import lazily — kaggle is a heavy import that touches credentials.
    from kaggle import KaggleApi

    RAW_DIR.mkdir(parents=True, exist_ok=True)
    api = KaggleApi()
    api.authenticate()
    print(f"Downloading {DATASET} -> {RAW_DIR} (this can take a few minutes)…")
    api.dataset_download_files(DATASET, path=str(RAW_DIR), unzip=True, quiet=False)

    # Report what landed on disk so the user can see which files downstream
    # scripts will find. Nothing is renamed or moved here. The walk is
    # recursive and restricted to regular files: an archive that unpacks into
    # subdirectories would otherwise show each directory with its own inode
    # size and hide the data files inside it.
    print("Files in data/raw after download:")
    downloaded = sorted(p for p in RAW_DIR.rglob("*") if p.is_file())
    for path in downloaded:
        size_mb = path.stat().st_size / 1024 / 1024
        # Relative path equals the bare file name for top-level files, so the
        # output is unchanged for a flat layout.
        print(f"  {path.relative_to(RAW_DIR)}  ({size_mb:.1f} MB)")


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()