engine-maintenance-space / src /hf_data_utils.py
ananttripathiak's picture
Upload folder using huggingface_hub
1aa7fae verified
"""
Utility functions for interacting with the Hugging Face Hub for DATASETS.
These helpers are used to:
- Register the raw engine dataset as a Hugging Face dataset repo.
- Upload processed train/test splits back to the dataset repo.
- Download files from the dataset repo for use in data preparation and modeling.
All functions expect a valid HF token to be available, typically via:
- The HF_TOKEN environment variable, or
- An explicit argument.
"""
from pathlib import Path
from typing import Optional
from huggingface_hub import HfApi, hf_hub_download
import config
def _get_token(explicit_token: Optional[str] = None) -> str:
token = explicit_token or config.HF_TOKEN
if not token:
raise ValueError(
"Hugging Face token is not set. "
"Set HF_TOKEN in the environment or pass token explicitly."
)
return token
def create_or_get_dataset_repo(
repo_id: str, token: Optional[str] = None, private: bool = False
) -> None:
"""
Create the dataset repo on Hugging Face Hub if it does not already exist.
"""
token = _get_token(token)
api = HfApi(token=token)
api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=private,
exist_ok=True,
)
def upload_dataset_file(
local_path: Path,
repo_id: Optional[str] = None,
repo_path: Optional[str] = None,
token: Optional[str] = None,
) -> None:
"""
Upload a single file to the Hugging Face dataset repo.
Parameters
----------
local_path : Path
The local file to upload.
repo_id : str, optional
The dataset repo ID (e.g., 'username/engine-maintenance-dataset').
Defaults to config.HF_DATASET_REPO.
repo_path : str, optional
The path inside the repo (e.g., 'data/train.csv'). Defaults to the
file name if not provided.
token : str, optional
Hugging Face token. Defaults to config.HF_TOKEN.
"""
token = _get_token(token)
repo_id = repo_id or config.HF_DATASET_REPO
repo_path = repo_path or local_path.name
api = HfApi(token=token)
create_or_get_dataset_repo(repo_id=repo_id, token=token)
api.upload_file(
path_or_fileobj=str(local_path),
path_in_repo=repo_path,
repo_id=repo_id,
repo_type="dataset",
)
def download_dataset_file(
filename: str,
repo_id: Optional[str] = None,
token: Optional[str] = None,
local_dir: Optional[Path] = None,
) -> Path:
"""
Download a file from the Hugging Face dataset repo and return its local path.
Parameters
----------
filename : str
The filename inside the dataset repo (e.g., 'data/engine_data.csv').
repo_id : str, optional
The dataset repo ID. Defaults to config.HF_DATASET_REPO.
token : str, optional
Hugging Face token.
local_dir : Path, optional
Directory to place the downloaded file. Defaults to config.DATA_DIR.
"""
token = _get_token(token)
repo_id = repo_id or config.HF_DATASET_REPO
local_dir = local_dir or config.DATA_DIR
local_dir.mkdir(parents=True, exist_ok=True)
downloaded_path = hf_hub_download(
repo_id=repo_id,
filename=filename,
repo_type="dataset",
token=token,
local_dir=str(local_dir),
local_dir_use_symlinks=False,
)
return Path(downloaded_path)
def register_raw_engine_data_to_hf(
token: Optional[str] = None,
repo_id: Optional[str] = None,
) -> None:
"""
Convenience function to register the original engine_data.csv
in the dataset repo under 'data/engine_data.csv'.
"""
repo_id = repo_id or config.HF_DATASET_REPO
local_path = config.RAW_DATA_FILE
if not local_path.exists():
raise FileNotFoundError(
f"Raw data file not found at {local_path}. "
"Ensure engine_data.csv is present in the data/ folder."
)
upload_dataset_file(
local_path=local_path,
repo_id=repo_id,
repo_path="data/engine_data.csv",
token=token,
)