"""
Utility functions for interacting with the Hugging Face Hub for DATASETS.

These helpers are used to:
- Register the raw engine dataset as a Hugging Face dataset repo.
- Upload processed train/test splits back to the dataset repo.
- Download files from the dataset repo for use in data preparation and modeling.

All functions expect a valid HF token to be available, typically via:
- The HF_TOKEN environment variable, or
- An explicit argument.
"""
| from pathlib import Path | |
| from typing import Optional | |
| from huggingface_hub import HfApi, hf_hub_download | |
| import config | |
| def _get_token(explicit_token: Optional[str] = None) -> str: | |
| token = explicit_token or config.HF_TOKEN | |
| if not token: | |
| raise ValueError( | |
| "Hugging Face token is not set. " | |
| "Set HF_TOKEN in the environment or pass token explicitly." | |
| ) | |
| return token | |
def create_or_get_dataset_repo(
    repo_id: str, token: Optional[str] = None, private: bool = False
) -> None:
    """
    Make sure a dataset repo exists on the Hugging Face Hub.

    Parameters
    ----------
    repo_id : str
        The dataset repo ID to create.
    token : str, optional
        Hugging Face token. Resolved through _get_token when omitted.
    private : bool, optional
        Forwarded to the Hub as the repo's `private` flag. Defaults to False.

    Raises
    ------
    ValueError
        If no token can be resolved.
    """
    hub_client = HfApi(token=_get_token(token))
    # exist_ok=True is passed so that a repo which is already present is not
    # treated as an error by create_repo.
    repo_options = {
        "repo_type": "dataset",
        "private": private,
        "exist_ok": True,
    }
    hub_client.create_repo(repo_id=repo_id, **repo_options)
def upload_dataset_file(
    local_path: Path,
    repo_id: Optional[str] = None,
    repo_path: Optional[str] = None,
    token: Optional[str] = None,
) -> None:
    """
    Push one local file into the Hugging Face dataset repo.

    The target repo is created first (via create_or_get_dataset_repo) so the
    upload does not depend on the repo having been set up beforehand.

    Parameters
    ----------
    local_path : Path
        The local file to upload.
    repo_id : str, optional
        The dataset repo ID (e.g., 'username/engine-maintenance-dataset').
        Defaults to config.HF_DATASET_REPO.
    repo_path : str, optional
        The path inside the repo (e.g., 'data/train.csv'). Defaults to the
        file name of `local_path` if not provided.
    token : str, optional
        Hugging Face token. Defaults to config.HF_TOKEN.

    Raises
    ------
    ValueError
        If no token can be resolved.
    """
    resolved_token = _get_token(token)
    target_repo = repo_id if repo_id else config.HF_DATASET_REPO
    destination = repo_path if repo_path else local_path.name

    hub_client = HfApi(token=resolved_token)
    create_or_get_dataset_repo(repo_id=target_repo, token=resolved_token)

    upload_kwargs = {
        "path_or_fileobj": str(local_path),
        "path_in_repo": destination,
        "repo_id": target_repo,
        "repo_type": "dataset",
    }
    hub_client.upload_file(**upload_kwargs)
def download_dataset_file(
    filename: str,
    repo_id: Optional[str] = None,
    token: Optional[str] = None,
    local_dir: Optional[Path] = None,
) -> Path:
    """
    Fetch one file from the Hugging Face dataset repo onto local disk.

    Parameters
    ----------
    filename : str
        The filename inside the dataset repo (e.g., 'data/engine_data.csv').
    repo_id : str, optional
        The dataset repo ID. Defaults to config.HF_DATASET_REPO.
    token : str, optional
        Hugging Face token. Resolved through _get_token when omitted.
    local_dir : Path, optional
        Directory to place the downloaded file. Defaults to config.DATA_DIR.
        It is created (including parents) if it does not exist.

    Returns
    -------
    Path
        The path reported by hf_hub_download for the downloaded file.

    Raises
    ------
    ValueError
        If no token can be resolved.
    """
    resolved_token = _get_token(token)
    source_repo = repo_id if repo_id else config.HF_DATASET_REPO
    target_dir = local_dir if local_dir else config.DATA_DIR

    # The destination directory must exist before the download is requested.
    target_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): newer huggingface_hub releases appear to deprecate
    # local_dir_use_symlinks -- confirm against the pinned library version.
    return Path(
        hf_hub_download(
            repo_id=source_repo,
            filename=filename,
            repo_type="dataset",
            token=resolved_token,
            local_dir=str(target_dir),
            local_dir_use_symlinks=False,
        )
    )
def register_raw_engine_data_to_hf(
    token: Optional[str] = None,
    repo_id: Optional[str] = None,
) -> None:
    """
    Upload the raw engine data file to the dataset repo.

    The file at config.RAW_DATA_FILE is stored in the repo under
    'data/engine_data.csv'.

    Parameters
    ----------
    token : str, optional
        Hugging Face token, forwarded to upload_dataset_file.
    repo_id : str, optional
        The dataset repo ID. Defaults to config.HF_DATASET_REPO.

    Raises
    ------
    FileNotFoundError
        If config.RAW_DATA_FILE does not exist locally.
    """
    target_repo = repo_id or config.HF_DATASET_REPO
    local_path = config.RAW_DATA_FILE

    if local_path.exists():
        upload_dataset_file(
            local_path=local_path,
            repo_id=target_repo,
            repo_path="data/engine_data.csv",
            token=token,
        )
        return

    # Nothing to upload: report the missing file before any Hub call is made.
    raise FileNotFoundError(
        f"Raw data file not found at {local_path}. "
        "Ensure engine_data.csv is present in the data/ folder."
    )