Spaces:
Running
Running
File size: 1,553 Bytes
a7d80f2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | import pandas as pd
from mlpipeline.entity import DataValidationConfig, DataValidationArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import DataValidationException
from mlpipeline.utils.common import read_yaml
import sys
# Module-level logger obtained from the project's get_logger helper, requested under this module's name.
logger = get_logger(__name__)
class DataValidation:
    """Run basic sanity checks on a CSV dataset and record the outcome.

    The checks read the CSV at ``config.data_dir``, write a one-line status
    message to ``config.status_file`` and report the result as a
    ``DataValidationArtifact``.
    """

    def __init__(self, config: DataValidationConfig):
        """Store the configuration used by :meth:`validate_schema`."""
        self.config = config

    @staticmethod
    def _collect_errors(df: pd.DataFrame) -> list:
        """Return the validation error messages for *df* (empty list if none)."""
        if df.empty:
            # On a frame with no rows, ``isnull().all()`` is vacuously True for
            # every column, so running the all-null check as well would add a
            # misleading second error. Report emptiness alone.
            return ["Dataset is empty"]
        if df.isnull().all().any():
            return ["Columns with all null values found"]
        return []

    def validate_schema(self) -> DataValidationArtifact:
        """Validate the dataset and persist the validation status.

        Returns:
            A ``DataValidationArtifact`` whose ``validation_status`` is True
            when no check failed, whose ``message`` is the text written to the
            status file, and whose ``schema_file_path`` is taken unchanged
            from the config.

        Raises:
            DataValidationException: Wraps any exception raised while reading
                the CSV, running the checks or writing the status file.
        """
        # NOTE(review): despite the method name, the schema file is not read
        # here; ``config.schema_file`` is only passed through to the artifact.
        try:
            logger.info("Starting data validation")
            df = pd.read_csv(self.config.data_dir)

            errors = self._collect_errors(df)
            validation_status = not errors
            status_message = (
                "Validation passed" if validation_status else "; ".join(errors)
            )

            # Explicit encoding so the status file does not depend on the
            # platform's locale default.
            with open(self.config.status_file, "w", encoding="utf-8") as f:
                f.write(status_message)

            logger.info(f"Validation status: {status_message}")
            return DataValidationArtifact(
                validation_status=validation_status,
                message=status_message,
                schema_file_path=self.config.schema_file,
            )
        except Exception as e:
            # Chain the original exception so its traceback stays attached.
            raise DataValidationException(str(e), sys) from e