Spaces:
Sleeping
Sleeping
| """Fetch the KCC (Kisan Call Centre) dataset from Kaggle. | |
| Requires Kaggle API credentials at ~/.kaggle/kaggle.json. Setup: | |
| pip install kaggle | |
| mkdir -p ~/.kaggle | |
| cp kaggle.json ~/.kaggle/ # from your Kaggle account settings | |
| chmod 600 ~/.kaggle/kaggle.json | |
| Usage: | |
| python scripts/fetch_kcc.py | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import shutil | |
| from pathlib import Path | |
# Absolute location of this script; the usage line in the module docstring
# runs it as scripts/fetch_kcc.py, so the repository root is two levels up.
_THIS_FILE = Path(__file__).resolve()
ROOT = _THIS_FILE.parent.parent

# Directory that receives the unzipped download.
RAW_DIR = ROOT.joinpath("data", "raw")

# Kaggle dataset identifier in "<owner>/<dataset-slug>" form.
DATASET = "sridhargutam/kcc-dataset"
def main() -> None:
    """Download the KCC dataset from Kaggle into ``RAW_DIR`` and list the result.

    Steps:
      1. Create ``RAW_DIR`` if it does not exist yet.
      2. Authenticate with the Kaggle API and download + unzip ``DATASET``
         into ``RAW_DIR``.
      3. Print every regular file found under ``RAW_DIR`` (recursively)
         together with its size in MB.

    Any exception raised by the ``kaggle`` package (missing credentials,
    network failure, ...) is allowed to propagate to the caller.
    """
    # Import lazily — kaggle is a heavy import that touches credentials.
    from kaggle import KaggleApi

    RAW_DIR.mkdir(parents=True, exist_ok=True)

    api = KaggleApi()
    api.authenticate()

    print(f"Downloading {DATASET} -> {RAW_DIR} (this can take a few minutes)…")
    api.dataset_download_files(DATASET, path=str(RAW_DIR), unzip=True, quiet=False)

    # Per the original author's note, the dataset publishes a CSV per-state plus
    # a combined file. Nothing is renamed or moved here: the files stay exactly
    # where the archive put them, and this listing shows downstream scripts
    # what is available.
    print("Files in data/raw after download:")
    # Walk recursively and skip directories: a directory's st_size is not the
    # size of its contents, and an archive may unpack into sub-folders. For a
    # flat archive the relative path is just the file name.
    for entry in sorted(RAW_DIR.rglob("*")):
        if not entry.is_file():
            continue
        size_mb = entry.stat().st_size / 1024 / 1024
        print(f" {entry.relative_to(RAW_DIR)} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()