Spaces:
Sleeping
Sleeping
File size: 3,409 Bytes
f7d11f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import great_expectations as gx
import pandas as pd
from predicting_outcomes_in_heart_failure.config import (
ASSET_NAME,
RAW_PATH,
SOURCE_NAME,
SUITE_NAME,
)
from util import set_gx, show_results
def run_test():
    """Populate the raw-data expectation suite, register it, and run a checkpoint.

    Takes no arguments. It reads the module-level names ``suite``, ``context``,
    ``batch_definition``, ``df``, ``features_columns`` and ``expected_columns``,
    which this file assigns in its ``__main__`` block before calling this
    function; calling it without those names defined raises ``NameError``.

    The suite, validation definition and checkpoint are all registered with
    ``add_or_update`` so that running the script again against a context that
    already holds objects with these names replaces them rather than failing.
    """
    # --- Table shape and per-column presence / completeness -----------------
    suite.add_expectation(
        gx.expectations.ExpectTableColumnCountToEqual(value=len(expected_columns))
    )
    for col in expected_columns:
        suite.add_expectation(gx.expectations.ExpectColumnToExist(column=col))
        suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column=col))

    # --- Column types --------------------------------------------------------
    numeric_types = {
        "Age": "int",
        "RestingBP": "int",
        "Cholesterol": "int",
        "MaxHR": "int",
        "Oldpeak": "float",
    }
    for col, type_name in numeric_types.items():
        suite.add_expectation(
            gx.expectations.ExpectColumnValuesToBeOfType(column=col, type_=type_name)
        )
    # NOTE(review): "ExerciseAngina" gets a value-set check below but no type
    # check here — confirm whether that omission is intentional.
    for col in ["Sex", "ChestPainType", "RestingECG", "ST_Slope"]:
        suite.add_expectation(
            gx.expectations.ExpectColumnValuesToBeOfType(column=col, type_="str")
        )

    # --- Allowed values (dict order fixes the order expectations are added) --
    allowed_values = {
        "FastingBS": [0, 1],
        "HeartDisease": [0, 1],
        "Sex": ["M", "F"],
        "ExerciseAngina": ["N", "Y"],
        "ST_Slope": ["Flat", "Up", "Down"],
        "RestingECG": ["Normal", "LVH", "ST"],
        "ChestPainType": ["ASY", "NAP", "ATA", "TA"],
    }
    for col, value_set in allowed_values.items():
        suite.add_expectation(
            gx.expectations.ExpectColumnValuesToBeInSet(column=col, value_set=value_set)
        )

    # --- Row uniqueness: over the feature columns, then over all columns -----
    for column_list in (features_columns, expected_columns):
        suite.add_expectation(
            gx.expectations.ExpectCompoundColumnsToBeUnique(column_list=column_list)
        )

    # --- Value range: only a lower bound is set for Age ----------------------
    suite.add_expectation(
        gx.expectations.ExpectColumnValuesToBeBetween(column="Age", min_value=18)
    )

    # --- Register everything and run ----------------------------------------
    context.suites.add_or_update(suite)
    # add_or_update (not add) keeps these two registrations consistent with the
    # suite registration above and makes re-running the script safe.
    validation_definition = context.validation_definitions.add_or_update(
        gx.core.validation_definition.ValidationDefinition(
            name=ASSET_NAME + "_validation_raw",
            data=batch_definition,
            suite=suite,
        )
    )
    checkpoint = context.checkpoints.add_or_update(
        gx.checkpoint.checkpoint.Checkpoint(
            name=ASSET_NAME + "_checkpoint_raw",
            validation_definitions=[validation_definition],
        )
    )
    checkpoint_result = checkpoint.run(batch_parameters={"dataframe": df})
    show_results(checkpoint_result)
if __name__ == "__main__":
    # Column layout checked by run_test(): the feature columns, followed by
    # the HeartDisease column.
    features_columns = [
        "Age",
        "Sex",
        "ChestPainType",
        "RestingBP",
        "Cholesterol",
        "FastingBS",
        "RestingECG",
        "MaxHR",
        "ExerciseAngina",
        "Oldpeak",
        "ST_Slope",
    ]
    expected_columns = [*features_columns, "HeartDisease"]

    # Load the raw CSV first, then obtain the GX objects; run_test() reads
    # all of these as module-level names.
    df = pd.read_csv(RAW_PATH)
    context, suite, batch_definition = set_gx(
        *(base + "_raw" for base in (SOURCE_NAME, ASSET_NAME, SUITE_NAME))
    )

    run_test()
|