Spaces:
Sleeping
Sleeping
| """ | |
| This module provides a utility function to download Kaggle datasets or competition data. | |
| The function automatically detects whether it is running in a Google Colab environment, a local Linux/Mac environment, or a Windows environment, and sets up the Kaggle API accordingly. | |
| Requirements: | |
| - Kaggle API installed (`pip install kaggle`) | |
| - Kaggle API key (`kaggle.json`) with appropriate permissions. | |
| Environment Detection: | |
| - Google Colab: Uses `/root/.config/kaggle/kaggle.json`. | |
| - Local Linux/Mac: Uses `~/.kaggle/kaggle.json`. | |
| - Windows: Uses `C:\\Users\\<Username>\\.kaggle\\kaggle.json`. | |
| Functions: | |
| get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str | |
| """ | |
| import os | |
| import zipfile | |
| import sys | |
| import shutil | |
| import platform | |
def get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str:
    """
    Download a Kaggle dataset or competition archive and extract it.

    The Kaggle API key is installed into the configuration directory for the
    detected environment (Google Colab, or the user's home directory on
    Linux/Mac/Windows), the ``kaggle`` command-line tool is invoked to
    download the data, and every downloaded ``.zip`` file is extracted in
    place and then deleted.

    The process working directory is never changed; all paths are resolved
    relative to the caller's current directory.

    Parameters:
        json_path (str): Path to your 'kaggle.json' file.
        data_name (str): Kaggle dataset or competition name (e.g.,
            'paultimothymooney/chest-xray-pneumonia' or
            'house-prices-advanced-regression-techniques').
        is_competition (bool): Set to True if downloading competition data.
            Default is False (for datasets).
        output_dir (str): Directory under which the data folder is created.
            Default is 'data/raw'.

    Returns:
        str: Path to the extracted dataset folder
        (``output_dir/<last component of data_name>``), or ``None`` if the
        download failed or produced no zip files.

    Raises:
        OSError: If 'kaggle.json' is not found or the credentials cannot be
            installed. The underlying exception is attached as ``__cause__``.

    Example of Usage:
        # For downloading a standard dataset
        dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
        print(f"Dataset is available at: {dataset_path}")

        # For downloading competition data
        competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
        print(f"Competition data is available at: {competition_path}")
    """
    # Imported at function scope so this block does not depend on a change
    # to the module's import section.
    import subprocess

    # Detect environment (Colab, local Linux/Mac, or Windows)
    is_colab = "google.colab" in sys.modules
    is_windows = platform.system() == "Windows"

    # Step 1: Setup Kaggle API credentials
    try:
        if is_colab:
            config_dir = "/root/.config/kaggle"
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for Colab environment.")
            kaggle_json_dest = os.path.join(config_dir, "kaggle.json")
            shutil.copy(json_path, kaggle_json_dest)
            os.chmod(kaggle_json_dest, 0o600)
        else:
            # For both local Linux/Mac and Windows, use the home directory
            config_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for local environment.")
            kaggle_json_dest = os.path.join(config_dir, "kaggle.json")
            # An existing key is kept as-is rather than overwritten.
            if not os.path.exists(kaggle_json_dest):
                shutil.copy(json_path, kaggle_json_dest)
            # POSIX permission bits are not meaningful on Windows.
            if not is_windows:
                os.chmod(kaggle_json_dest, 0o600)
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise OSError(f"Could not set up Kaggle API credentials: {e}") from e

    # Step 2: Create output directory
    dataset_dir = os.path.join(output_dir, data_name.split('/')[-1])
    os.makedirs(dataset_dir, exist_ok=True)

    # Step 3: Download the dataset or competition data.
    # The command is an argument list (no shell), so characters in
    # `data_name` cannot be interpreted as shell syntax, and `-p` tells the
    # CLI where to save instead of relying on os.chdir().
    if is_competition:
        print(f"Downloading competition data: {data_name}")
        cmd = ["kaggle", "competitions", "download", "-c", data_name, "-p", dataset_dir]
    else:
        print(f"Downloading dataset: {data_name}")
        cmd = ["kaggle", "datasets", "download", "-d", data_name, "-p", dataset_dir]
    try:
        completed = subprocess.run(cmd, check=False)
    except OSError as e:
        # Raised e.g. when the `kaggle` executable is not installed / not on PATH.
        print(f"Error during download: {e}")
        return None
    if completed.returncode != 0:
        print(f"Error during download: kaggle exited with status {completed.returncode}")
        return None

    # Step 4: Unzip all downloaded files
    zip_files = [f for f in os.listdir(dataset_dir) if f.endswith(".zip")]
    if not zip_files:
        print("No zip files found. Please check the dataset or competition name.")
        return None
    for zip_file in zip_files:
        zip_path = os.path.join(dataset_dir, zip_file)
        try:
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(dataset_dir)
            print(f"Extracted: {zip_file}")
            # Remove only after the archive handle is closed (required on Windows).
            os.remove(zip_path)
        except Exception as e:
            # Best effort: report the failing archive and continue with the rest.
            print(f"Error extracting {zip_file}: {e}")

    return dataset_dir