amul-ai-eval / scripts /fetch_kcc.py
bpHigh's picture
HF Space: add charts tab
74e6b83
Raw
History Blame Contribute Delete
1.26 kB
"""Fetch the KCC (Kisan Call Centre) dataset from Kaggle.
Requires Kaggle API credentials at ~/.kaggle/kaggle.json. Setup:
pip install kaggle
mkdir -p ~/.kaggle
cp kaggle.json ~/.kaggle/ # from your Kaggle account settings
chmod 600 ~/.kaggle/kaggle.json
Usage:
python scripts/fetch_kcc.py
"""
from __future__ import annotations
import os
import shutil
from pathlib import Path
# Repository root: the directory above the one that contains this script.
_THIS_FILE = Path(__file__).resolve()
ROOT = _THIS_FILE.parent.parent

# Directory that receives the downloaded (unzipped) dataset files.
RAW_DIR = ROOT.joinpath("data", "raw")

# Kaggle dataset identifier handed to the download call in main().
DATASET = "sridhargutam/kcc-dataset"
def main() -> None:
    """Download the KCC dataset into ``data/raw`` and list what arrived.

    Creates ``RAW_DIR`` if needed, authenticates against the Kaggle API,
    downloads and unzips ``DATASET`` into ``RAW_DIR``, then prints every
    regular file found under ``RAW_DIR`` together with its size in MB.

    The files are left exactly as the archive laid them out; nothing is
    renamed or moved by this script.
    """
    # Import lazily — kaggle is a heavy import that touches credentials.
    from kaggle import KaggleApi

    RAW_DIR.mkdir(parents=True, exist_ok=True)

    api = KaggleApi()
    api.authenticate()

    print(f"Downloading {DATASET} -> {RAW_DIR} (this can take a few minutes)…")
    api.dataset_download_files(DATASET, path=str(RAW_DIR), unzip=True, quiet=False)

    # Report regular files only, walking recursively: if the archive unpacks
    # into subdirectories, a directory's own st_size says nothing about its
    # contents, and a top-level-only listing would hide the actual data files.
    # For a flat layout the relative path is just the file name.
    print("Files in data/raw after download:")
    for p in sorted(RAW_DIR.rglob("*")):
        if not p.is_file():
            continue
        size_mb = p.stat().st_size / 1024 / 1024
        print(f" {p.relative_to(RAW_DIR).as_posix()} ({size_mb:.1f} MB)")
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()