Spaces:
Sleeping
Sleeping
| """ | |
| This file prepares the raw project data folders for the new MLOps pipeline. | |
| It does not train models or run inference. It only makes sure the expected raw data paths exist, | |
| creating missing folders safely, and counts the raw videos and images that are currently available. This keeps data preparation clean and organized. | |
| It is the first step before frame extraction or dataset generation. | |
| """ | |
| from pathlib import Path | |
| from src.entity.config_entity import PathsConfig | |
| from src.utils.common import create_directories | |
| from src.utils.helpers import list_image_files, list_video_files | |
| from src.utils.logger import get_logger | |
class DataIngestion:
    """
    Set up the project's data folders and report which raw inputs are present.
    """

    def __init__(
        self,
        paths_config: PathsConfig,
        log_dir: Path | None = None,
        log_level: str = "INFO",
    ) -> None:
        """
        Keep the paths configuration and build a logger named after the class.
        """
        self.paths_config = paths_config
        # The concrete class name is used so subclasses log under their own name.
        logger_name = type(self).__name__
        self.logger = get_logger(logger_name, log_dir=log_dir, level=log_level)

    def create_required_directories(self) -> None:
        """
        Pass every data folder the pipeline relies on to ``create_directories``.
        """
        cfg = self.paths_config
        required_dirs = [
            cfg.raw_data_dir,
            cfg.interim_data_dir,
            cfg.processed_data_dir,
            cfg.raw_video_dir,
            cfg.raw_image_dir,
            cfg.posture_feature_dir,
            cfg.hand_crop_dir,
            cfg.phone_dataset_dir,
            cfg.metrics_dir,
            cfg.predictions_dir,
        ]
        create_directories(required_dirs)
        self.logger.info("Required data directories are ready.")

    def summarize_available_data(self) -> dict:
        """
        Report how many raw videos and raw images are currently listed.

        The returned dict holds the two counts plus the string form of the
        directories they were collected from; the same dict is logged.
        """
        cfg = self.paths_config
        video_count = len(list_video_files(cfg.raw_video_dir))
        image_count = len(list_image_files(cfg.raw_image_dir))

        summary = {}
        summary["raw_video_count"] = video_count
        summary["raw_image_count"] = image_count
        summary["raw_video_dir"] = str(cfg.raw_video_dir)
        summary["raw_image_dir"] = str(cfg.raw_image_dir)

        self.logger.info("Data summary: %s", summary)
        return summary

    def run(self) -> dict:
        """
        Execute the ingestion stage: prepare folders, then summarize raw data.
        """
        self.create_required_directories()
        summary = self.summarize_available_data()
        return summary