# File: nyayasetu/preprocessing/download.py
# Uploaded by CaffeinatedCoding using huggingface_hub (commit 0214972, verified).
"""
Download Indian Supreme Court judgments from Kaggle.
Uses kagglehub to download directly - no manual zip extraction needed.
Intended pipeline output: data/raw_judgments.jsonl
(the code in this script only downloads the dataset and lists its files).
WHY kagglehub? Programmatic download - reproducible, no manual steps.
Anyone cloning this repo can run this script and get the same data.
"""
import kagglehub
import json
import os
import glob
def download_judgments():
    """Download the SC judgments dataset with kagglehub and list its files.

    Prints progress messages, one line per file found, and a final count.

    Returns:
        tuple: ``(path, all_files)`` where ``path`` is the value returned by
        ``kagglehub.dataset_download`` and ``all_files`` is a list of the full
        paths of every file found under ``path``, in sorted traversal order.
    """
    print("Downloading SC Judgments dataset from Kaggle...")
    # Downloads to a local cache folder, returns the path
    path = kagglehub.dataset_download("adarshsingh0903/legal-dataset-sc-judgments-india-19502024")
    print(f"Dataset downloaded to: {path}")

    # See what files we got
    all_files = []
    for root, dirs, files in os.walk(path):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # directory names in place (honoured by top-down walks) and the file
        # names keeps the listing identical from one machine/run to the next.
        dirs.sort()
        for file in sorted(files):
            full_path = os.path.join(root, file)
            all_files.append(full_path)
            print(f" Found: {full_path}")

    print(f"\nTotal files found: {len(all_files)}")
    return path, all_files
if __name__ == "__main__":
    # Script entry point: run the download when executed directly and keep
    # the returned dataset path and file list bound at module level.
    path, files = download_judgments()