File size: 2,003 Bytes
1aa7fae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
"""
Data registration script for the predictive maintenance project.
This script is the analogue of the notebook's `data_register.py`:
- It ensures the Hugging Face dataset repo exists.
- It uploads the raw engine dataset from the local `data/` folder
into the dataset repo so it can be consumed by other stages
(EDA, data preparation, model training) using a consistent source.
"""
from __future__ import annotations
import sys
from pathlib import Path
# Project-local modules are imported inside a guard: if either import fails,
# the error is reported on stderr together with the interpreter's module
# search path, and the process exits with status 1.
try:
    import config
    from hf_data_utils import register_raw_engine_data_to_hf
except ImportError as import_error:
    print(
        f"ERROR: Failed to import modules: {import_error}",
        f"Python path: {sys.path}",
        sep="\n",
        file=sys.stderr,
    )
    sys.exit(1)
def main() -> None:
    """Register the raw engine dataset in the Hugging Face dataset repo.

    The function first echoes the settings read from ``config`` (only
    whether ``HF_TOKEN`` is set is printed, never its value), then checks
    that the raw data file exists and that a token is configured, and
    finally calls ``register_raw_engine_data_to_hf()``.

    Any exception raised along the way -- including the
    ``FileNotFoundError`` / ``ValueError`` raised by the checks below -- is
    reported on stderr with a traceback, after which the process exits
    with status 1.
    """
    try:
        # Diagnostic dump of the settings this run is going to use.
        print(f"PROJECT_ROOT: {config.PROJECT_ROOT}")
        raw_file = config.RAW_DATA_FILE
        print(f"RAW_DATA_FILE: {raw_file}")
        print(f"RAW_DATA_FILE exists: {raw_file.exists()}")
        repo_id = config.HF_DATASET_REPO
        print(f"HF_DATASET_REPO: {repo_id}")
        token_present = bool(config.HF_TOKEN)
        print(f"HF_TOKEN is set: {token_present}")

        # Precondition 1: the raw CSV must be present locally.
        if not raw_file.exists():
            missing_file_msg = (
                f"Expected raw data at {raw_file}, "
                "but the file does not exist. Make sure `engine_data.csv` "
                "is placed in the `data/` folder."
            )
            raise FileNotFoundError(missing_file_msg)

        # Precondition 2: a Hugging Face token must be configured.
        if not token_present:
            missing_token_msg = (
                "HF_TOKEN is not set. Please set it as an environment variable "
                "or in GitHub Secrets."
            )
            raise ValueError(missing_token_msg)

        print(f"Registering raw engine dataset from: {raw_file}")
        print(f"Target HF dataset repo: {repo_id}")
        register_raw_engine_data_to_hf()
        print("✅ Dataset registration to Hugging Face completed.")
    except Exception as exc:
        # Top-level boundary of the script: summarise the failure, print the
        # full traceback, and signal the failure through the exit status.
        print(f"ERROR: {type(exc).__name__}: {exc}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)
# Run the registration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|