Spaces:
Running
Running
| import pandas as pd | |
| from mlpipeline.entity import DataValidationConfig, DataValidationArtifact | |
| from mlpipeline.logging.logger import get_logger | |
| from mlpipeline.exception import DataValidationException | |
| from mlpipeline.utils.common import read_yaml | |
| import sys | |
| # Module-level logger, obtained from the project's get_logger helper using this module's name. | |
| logger = get_logger(__name__) | |
class DataValidation:
    """Run basic sanity checks on the CSV dataset named by the configuration.

    Two checks are performed: the dataset must contain data, and no column
    may consist entirely of null values. The outcome is written to the
    configured status file and returned as a ``DataValidationArtifact``.
    """

    def __init__(self, config: DataValidationConfig):
        """Store the validation configuration.

        Args:
            config: Object providing ``data_dir`` (path handed to
                ``pandas.read_csv``), ``status_file`` (where the status text
                is written) and ``schema_file`` (forwarded unchanged into the
                returned artifact).
        """
        self.config = config

    def validate_schema(self) -> DataValidationArtifact:
        """Validate the dataset and persist the result.

        Returns:
            A ``DataValidationArtifact`` carrying the boolean validation
            status, the status message, and the configured schema file path.

        Raises:
            DataValidationException: Wraps any exception raised while reading
                the data, running the checks, or writing the status file.
        """
        try:
            logger.info("Starting data validation")
            df = pd.read_csv(self.config.data_dir)

            errors = []
            if df.empty:
                errors.append("Dataset is empty")
            # Only meaningful when the frame has data: on an empty frame every
            # column is vacuously "all null" (all() over nothing is True),
            # which would add a misleading second error.
            elif df.isnull().all().any():
                errors.append("Columns with all null values found")

            validation_status = not errors
            status_message = (
                "Validation passed" if validation_status else "; ".join(errors)
            )

            # Explicit encoding keeps the status file identical across
            # platforms regardless of the locale default.
            with open(self.config.status_file, "w", encoding="utf-8") as f:
                f.write(status_message)

            logger.info(f"Validation status: {status_message}")
            return DataValidationArtifact(
                validation_status=validation_status,
                message=status_message,
                schema_file_path=self.config.schema_file,
            )
        except Exception as e:
            # Chain the original error so its traceback stays attached as the
            # explicit cause of the domain exception.
            raise DataValidationException(str(e), sys) from e