File size: 3,409 Bytes
f7d11f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import great_expectations as gx
import pandas as pd
from predicting_outcomes_in_heart_failure.config import (
    ASSET_NAME,
    RAW_PATH,
    SOURCE_NAME,
    SUITE_NAME,
)
from util import set_gx, show_results


def run_test():
    """Fill the raw-data expectation suite, run it via a checkpoint, and show the results.

    Takes no arguments and returns nothing. The function reads the module-level
    names ``suite``, ``context``, ``batch_definition``, ``df``,
    ``expected_columns`` and ``features_columns``; in the visible source these
    are assigned only inside the ``if __name__ == "__main__":`` block, so they
    must exist before this function is called.
    """
    expectations = gx.expectations
    register = suite.add_expectation

    # Type name expected for each type-checked column, in registration order.
    column_types = {
        "Age": "int",
        "RestingBP": "int",
        "Cholesterol": "int",
        "MaxHR": "int",
        "Oldpeak": "float",
        "Sex": "str",
        "ChestPainType": "str",
        "RestingECG": "str",
        "ST_Slope": "str",
    }

    # Closed set of values accepted for each categorical / binary column,
    # in registration order.
    allowed_values = {
        "FastingBS": [0, 1],
        "HeartDisease": [0, 1],
        "Sex": ["M", "F"],
        "ExerciseAngina": ["N", "Y"],
        "ST_Slope": ["Flat", "Up", "Down"],
        "RestingECG": ["Normal", "LVH", "ST"],
        "ChestPainType": ["ASY", "NAP", "ATA", "TA"],
    }

    # Table shape: exact column count, then presence and non-null per column.
    register(expectations.ExpectTableColumnCountToEqual(value=len(expected_columns)))
    for name in expected_columns:
        register(expectations.ExpectColumnToExist(column=name))
        register(expectations.ExpectColumnValuesToNotBeNull(column=name))

    for name, type_name in column_types.items():
        register(expectations.ExpectColumnValuesToBeOfType(column=name, type_=type_name))

    for name, values in allowed_values.items():
        register(expectations.ExpectColumnValuesToBeInSet(column=name, value_set=values))

    # Row uniqueness, first over the feature columns only, then over all columns.
    for columns in (features_columns, expected_columns):
        register(expectations.ExpectCompoundColumnsToBeUnique(column_list=columns))

    register(expectations.ExpectColumnValuesToBeBetween(column="Age", min_value=18))

    # Store the populated suite, then wrap it in a validation definition and a
    # checkpoint registered on the same context.
    context.suites.add_or_update(suite)

    raw_validation = gx.core.validation_definition.ValidationDefinition(
        name=ASSET_NAME + "_validation_raw",
        data=batch_definition,
        suite=suite,
    )
    validation_definition = context.validation_definitions.add(raw_validation)

    raw_checkpoint = gx.checkpoint.checkpoint.Checkpoint(
        name=ASSET_NAME + "_checkpoint_raw",
        validation_definitions=[validation_definition],
    )
    checkpoint = context.checkpoints.add(raw_checkpoint)

    # The module-level dataframe is supplied to the checkpoint at run time.
    checkpoint_result = checkpoint.run(batch_parameters={"dataframe": df})
    show_results(checkpoint_result)


if __name__ == "__main__":
    # Column layout expected in the raw CSV: eleven feature columns followed by
    # the "HeartDisease" column. run_test() reads both lists as module globals.
    features_columns = [
        "Age",
        "Sex",
        "ChestPainType",
        "RestingBP",
        "Cholesterol",
        "FastingBS",
        "RestingECG",
        "MaxHR",
        "ExerciseAngina",
        "Oldpeak",
        "ST_Slope",
    ]
    expected_columns = [*features_columns, "HeartDisease"]

    # Load the raw dataset, then obtain the context, suite and batch definition
    # from set_gx; run_test() picks all of these up as module globals.
    df = pd.read_csv(RAW_PATH)
    context, suite, batch_definition = set_gx(
        SOURCE_NAME + "_raw",
        ASSET_NAME + "_raw",
        SUITE_NAME + "_raw",
    )

    run_test()