Spaces:
Sleeping
Sleeping
| import great_expectations as gx | |
| import pandas as pd | |
| from predicting_outcomes_in_heart_failure.config import ( | |
| ASSET_NAME, | |
| PREPROCESSED_CSV, | |
| SOURCE_NAME, | |
| SUITE_NAME, | |
| ) | |
| from util import set_gx, show_results | |
def run_test():
    """Build the expectation suite for the preprocessed data and run it.

    Takes no arguments: it reads the module-level names ``suite``,
    ``context``, ``batch_definition``, ``expected_columns`` and ``df``,
    which this file assigns in its ``if __name__ == "__main__"`` block.
    Calling it before those names exist raises ``NameError``.

    Expectations added, in order:
      * the table has exactly ``len(expected_columns)`` columns;
      * every expected column exists and contains no nulls;
      * the five continuous columns are floats with mean in [-0.1, 0.1]
        and standard deviation in [0.9, 1.1]
        (NOTE(review): presumably they were standardized upstream - confirm);
      * the binary columns only contain 0/1;
      * the one-hot columns only contain True/False.

    The suite, validation definition and checkpoint are then registered on
    the context, the checkpoint is run against ``df`` and the result is
    handed to ``show_results``.
    """
    continuous_columns = ("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak")
    binary_columns = ("Sex", "FastingBS", "ExerciseAngina", "HeartDisease")
    one_hot_columns = (
        "ChestPainType_ASY",
        "ChestPainType_ATA",
        "ChestPainType_NAP",
        "ChestPainType_TA",
        "RestingECG_LVH",
        "RestingECG_Normal",
        "RestingECG_ST",
        "ST_Slope_Down",
        "ST_Slope_Flat",
        "ST_Slope_Up",
    )

    # Schema: exact column count, presence and completeness of each column.
    suite.add_expectation(
        gx.expectations.ExpectTableColumnCountToEqual(value=len(expected_columns))
    )
    for col in expected_columns:
        suite.add_expectation(gx.expectations.ExpectColumnToExist(column=col))
        suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column=col))

    # Continuous features: float dtype, mean near 0, standard deviation near 1.
    for col in continuous_columns:
        suite.add_expectation(
            gx.expectations.ExpectColumnValuesToBeOfType(column=col, type_="float")
        )
        suite.add_expectation(
            gx.expectations.ExpectColumnMeanToBeBetween(column=col, min_value=-0.1, max_value=0.1)
        )
        suite.add_expectation(
            gx.expectations.ExpectColumnStdevToBeBetween(column=col, min_value=0.9, max_value=1.1)
        )

    # Binary features and the target are encoded as 0/1.
    for col in binary_columns:
        suite.add_expectation(
            gx.expectations.ExpectColumnValuesToBeInSet(column=col, value_set=[0, 1])
        )

    # One-hot encoded categoricals are stored as booleans.
    for col in one_hot_columns:
        suite.add_expectation(
            gx.expectations.ExpectColumnValuesToBeInSet(column=col, value_set=[True, False])
        )

    context.suites.add_or_update(suite)

    # add_or_update (rather than add) matches how the suite is registered
    # above; plain add() is expected to fail when the name already exists,
    # e.g. on a second run against a persistent context.
    validation_definition = context.validation_definitions.add_or_update(
        gx.core.validation_definition.ValidationDefinition(
            name=ASSET_NAME + "_validation_processed",
            data=batch_definition,
            suite=suite,
        )
    )
    checkpoint = context.checkpoints.add_or_update(
        gx.checkpoint.checkpoint.Checkpoint(
            name=ASSET_NAME + "_checkpoint_validation_processed",
            validation_definitions=[validation_definition],
        )
    )

    # The dataframe is supplied at run time through the batch parameters.
    checkpoint_result = checkpoint.run(batch_parameters={"dataframe": df})
    show_results(checkpoint_result)
if __name__ == "__main__":
    # Preprocessed dataset that run_test() validates (read as a module global).
    df = pd.read_csv(PREPROCESSED_CSV)

    # set_gx receives the source, asset and suite names, each suffixed with
    # "_processed", and returns the objects run_test() reads as globals.
    context, suite, batch_definition = set_gx(
        *(base + "_processed" for base in (SOURCE_NAME, ASSET_NAME, SUITE_NAME))
    )

    # Model input columns, in the order they are expected in the CSV header.
    features_columns = [
        "Age", "Sex", "RestingBP", "Cholesterol",
        "FastingBS", "MaxHR", "ExerciseAngina", "Oldpeak",
        "ChestPainType_ASY", "ChestPainType_ATA",
        "ChestPainType_NAP", "ChestPainType_TA",
        "RestingECG_LVH", "RestingECG_Normal", "RestingECG_ST",
        "ST_Slope_Down", "ST_Slope_Flat", "ST_Slope_Up",
    ]
    # Full expected schema: the features followed by the target column.
    expected_columns = [*features_columns, "HeartDisease"]

    run_test()