File size: 4,098 Bytes
1aa7fae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
Utility functions for interacting with the Hugging Face Hub for DATASETS.

These helpers are used to:
- Register the raw engine dataset as a Hugging Face dataset repo.
- Upload processed train/test splits back to the dataset repo.
- Download files from the dataset repo for use in data preparation and modeling.

All functions expect a valid HF token to be available, typically via:
- The HF_TOKEN environment variable, or
- An explicit argument.
"""

from pathlib import Path
from typing import Optional

from huggingface_hub import HfApi, hf_hub_download

import config


def _get_token(explicit_token: Optional[str] = None) -> str:
    token = explicit_token or config.HF_TOKEN
    if not token:
        raise ValueError(
            "Hugging Face token is not set. "
            "Set HF_TOKEN in the environment or pass token explicitly."
        )
    return token


def create_or_get_dataset_repo(
    repo_id: str, token: Optional[str] = None, private: bool = False
) -> None:
    """
    Ensure a dataset repo with the given ID exists on the Hugging Face Hub.

    Parameters
    ----------
    repo_id : str
        The dataset repo ID to create.
    token : str, optional
        Hugging Face token. Resolved through ``_get_token`` when omitted.
    private : bool, optional
        Forwarded to ``create_repo`` as the repo's ``private`` flag.
    """
    hub = HfApi(token=_get_token(token))
    # exist_ok=True is passed so that calling this for a repo that is
    # already present is not treated as a failure.
    hub.create_repo(
        repo_id=repo_id,
        exist_ok=True,
        private=private,
        repo_type="dataset",
    )


def upload_dataset_file(
    local_path: Path,
    repo_id: Optional[str] = None,
    repo_path: Optional[str] = None,
    token: Optional[str] = None,
) -> None:
    """
    Upload a single file to the Hugging Face dataset repo.

    The target repo is created first (as a dataset repo) if it does not
    already exist, then the file is uploaded to it.

    Parameters
    ----------
    local_path : Path
        The local file to upload. A plain string path is also accepted and
        is converted to a ``Path`` internally.
    repo_id : str, optional
        The dataset repo ID (e.g., 'username/engine-maintenance-dataset').
        Defaults to config.HF_DATASET_REPO.
    repo_path : str, optional
        The path inside the repo (e.g., 'data/train.csv'). Defaults to the
        file name if not provided.
    token : str, optional
        Hugging Face token. Defaults to config.HF_TOKEN.

    Raises
    ------
    ValueError
        If no Hugging Face token can be resolved.
    """
    token = _get_token(token)
    repo_id = repo_id or config.HF_DATASET_REPO

    # Normalize once: a str path used to work only when repo_path was given
    # (it was merely passed through str()), but crashed on `.name` when
    # repo_path was omitted. Path inputs are unaffected by this conversion.
    local_path = Path(local_path)
    repo_path = repo_path or local_path.name

    api = HfApi(token=token)
    create_or_get_dataset_repo(repo_id=repo_id, token=token)

    api.upload_file(
        path_or_fileobj=str(local_path),
        path_in_repo=repo_path,
        repo_id=repo_id,
        repo_type="dataset",
    )


def download_dataset_file(
    filename: str,
    repo_id: Optional[str] = None,
    token: Optional[str] = None,
    local_dir: Optional[Path] = None,
) -> Path:
    """
    Fetch one file from the Hugging Face dataset repo and return where it
    was stored locally.

    Parameters
    ----------
    filename : str
        The filename inside the dataset repo (e.g., 'data/engine_data.csv').
    repo_id : str, optional
        The dataset repo ID. Defaults to config.HF_DATASET_REPO.
    token : str, optional
        Hugging Face token. Resolved through ``_get_token`` when omitted.
    local_dir : Path, optional
        Directory to place the downloaded file. Defaults to config.DATA_DIR.
        The directory (and any missing parents) is created before the
        download starts.

    Returns
    -------
    Path
        The path reported by ``hf_hub_download``, wrapped in a ``Path``.
    """
    auth_token = _get_token(token)
    source_repo = repo_id if repo_id else config.HF_DATASET_REPO
    target_dir = local_dir if local_dir else config.DATA_DIR
    target_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): newer huggingface_hub releases appear to deprecate
    # local_dir_use_symlinks -- confirm against the installed version
    # before removing it.
    return Path(
        hf_hub_download(
            repo_id=source_repo,
            filename=filename,
            repo_type="dataset",
            token=auth_token,
            local_dir=str(target_dir),
            local_dir_use_symlinks=False,
        )
    )


def register_raw_engine_data_to_hf(
    token: Optional[str] = None,
    repo_id: Optional[str] = None,
) -> None:
    """
    Publish the raw engine data file (config.RAW_DATA_FILE) to the dataset
    repo at 'data/engine_data.csv'.

    Parameters
    ----------
    token : str, optional
        Hugging Face token, forwarded unchanged to ``upload_dataset_file``.
    repo_id : str, optional
        The dataset repo ID. Defaults to config.HF_DATASET_REPO.

    Raises
    ------
    FileNotFoundError
        If config.RAW_DATA_FILE does not exist locally.
    """
    target_repo = repo_id or config.HF_DATASET_REPO
    local_path = config.RAW_DATA_FILE

    if local_path.exists():
        upload_dataset_file(
            local_path=local_path,
            repo_id=target_repo,
            repo_path="data/engine_data.csv",
            token=token,
        )
    else:
        # Nothing is sent to the Hub when the source file is absent.
        raise FileNotFoundError(
            f"Raw data file not found at {local_path}. "
            "Ensure engine_data.csv is present in the data/ folder."
        )