"""
Data registration script for the predictive maintenance project.

This script is the analogue of the notebook's `data_register.py`:

- It ensures the Hugging Face dataset repo exists.
- It uploads the raw engine dataset from the local `data/` folder
  into the dataset repo so it can be consumed by other stages
  (EDA, data preparation, model training) using a consistent source.
"""
from __future__ import annotations

import sys
from pathlib import Path

# The project-local modules are imported defensively: when either one cannot
# be imported, the failure and the active module search path are written to
# stderr and the script stops with exit status 1.
try:
    import config
    from hf_data_utils import register_raw_engine_data_to_hf
except ImportError as import_error:
    for diagnostic in (
        f"ERROR: Failed to import modules: {import_error}",
        f"Python path: {sys.path}",
    ):
        print(diagnostic, file=sys.stderr)
    sys.exit(1)
def main() -> None:
    """Register the raw engine dataset with the Hugging Face dataset repo.

    Prints the configuration values read from ``config`` (project root, raw
    data path and whether it exists, target repo, and whether a token is
    set), checks the two preconditions, and then calls
    ``register_raw_engine_data_to_hf``.

    A ``FileNotFoundError`` is raised internally when the raw data file is
    missing and a ``ValueError`` when ``config.HF_TOKEN`` is falsy. Any
    exception raised inside the function is reported on stderr, its
    traceback is printed, and the process exits with status 1.
    """
    try:
        # Configuration dump. Each value is read right where it is first
        # printed so the output order matches the order of the lookups.
        print(f"PROJECT_ROOT: {config.PROJECT_ROOT}")
        raw_path = config.RAW_DATA_FILE
        print(f"RAW_DATA_FILE: {raw_path}")
        print(f"RAW_DATA_FILE exists: {raw_path.exists()}")
        repo_id = config.HF_DATASET_REPO
        print(f"HF_DATASET_REPO: {repo_id}")
        token = config.HF_TOKEN
        print(f"HF_TOKEN is set: {bool(token)}")

        # Precondition 1: the raw CSV must be present on disk.
        if not raw_path.exists():
            missing_file_msg = (
                f"Expected raw data at {raw_path}, "
                "but the file does not exist. Make sure `engine_data.csv` "
                "is placed in the `data/` folder."
            )
            raise FileNotFoundError(missing_file_msg)

        # Precondition 2: a Hugging Face token must be configured.
        if not token:
            missing_token_msg = (
                "HF_TOKEN is not set. Please set it as an environment variable "
                "or in GitHub Secrets."
            )
            raise ValueError(missing_token_msg)

        print(f"Registering raw engine dataset from: {raw_path}")
        print(f"Target HF dataset repo: {repo_id}")
        register_raw_engine_data_to_hf()
        print("✅ Dataset registration to Hugging Face completed.")
    except Exception as exc:
        # Top-level boundary of the script: report, show the traceback, and
        # signal failure through the exit status.
        print(f"ERROR: {type(exc).__name__}: {exc}", file=sys.stderr)
        from traceback import print_exc

        print_exc()
        sys.exit(1)
# Run the registration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()