File size: 1,553 Bytes
a7d80f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import pandas as pd
from mlpipeline.entity import DataValidationConfig, DataValidationArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import DataValidationException
from mlpipeline.utils.common import read_yaml
import sys

# Module-level logger, obtained from the project's get_logger helper under
# this module's name; shared by everything defined in this file.
logger = get_logger(__name__)


class DataValidation:
    """Run basic sanity checks on a CSV dataset and record the outcome.

    The checks performed are: the dataset has at least one row, and no
    column consists entirely of null values.
    """

    def __init__(self, config: DataValidationConfig):
        """Store the validation configuration.

        Args:
            config: Object providing ``data_dir``, ``status_file`` and
                ``schema_file``, which ``validate_schema`` reads.
        """
        self.config = config

    def validate_schema(self) -> DataValidationArtifact:
        """Validate the dataset and write the result to the status file.

        Reads the CSV at ``config.data_dir`` (passed straight to
        ``pd.read_csv``, so it must be something that function accepts),
        runs the checks, overwrites ``config.status_file`` with the
        resulting message, and returns an artifact describing the outcome.

        NOTE(review): despite the method name, the schema file is not
        consulted here; ``config.schema_file`` is only forwarded to the
        returned artifact.

        Returns:
            A ``DataValidationArtifact`` carrying the boolean validation
            status, the status message, and the schema file path.

        Raises:
            DataValidationException: Wraps any exception raised while
                reading, checking, or writing the status file.
        """
        try:
            logger.info("Starting data validation")

            df = pd.read_csv(self.config.data_dir)

            errors = []

            if df.empty:
                errors.append("Dataset is empty")
            # Only look for all-null columns when there are rows: ``all()``
            # over zero rows is vacuously True, so an empty frame would
            # otherwise also be reported as having all-null columns.
            elif df.isnull().all().any():
                errors.append("Columns with all null values found")

            validation_status = not errors
            status_message = "Validation passed" if validation_status else "; ".join(errors)

            # Explicit encoding keeps the status file independent of the
            # platform's default locale encoding.
            with open(self.config.status_file, "w", encoding="utf-8") as f:
                f.write(status_message)

            logger.info(f"Validation status: {status_message}")

            return DataValidationArtifact(
                validation_status=validation_status,
                message=status_message,
                schema_file_path=self.config.schema_file
            )
        except Exception as e:
            # Chain explicitly so the original traceback is kept as the cause.
            raise DataValidationException(str(e), sys) from e