Spaces:
Sleeping
Sleeping
File size: 5,183 Bytes
import great_expectations as ge
from great_expectations.dataset import PandasDataset
from typing import Tuple, List
def validate_telco_data(df) -> Tuple[bool, List[str]]:
    """
    Validate a Telco Customer Churn DataFrame with Great Expectations.

    Registers schema, allowed-value, numeric-range and consistency
    expectations on the data, runs the whole suite once, prints a summary,
    and reports which expectations failed. Intended as a data quality gate
    before model training.

    Args:
        df: pandas DataFrame holding the Telco churn data. It is wrapped in a
            Great Expectations ``PandasDataset`` for validation.

    Returns:
        A ``(success, failed_expectations)`` tuple. ``success`` is the overall
        ``results["success"]`` flag of the validation run.
        ``failed_expectations`` lists the expectation type of every failed
        check, in result order. Only the type is recorded (not the column),
        so the same type can appear more than once.
    """
    print("π Starting data validation with Great Expectations...")

    # Convert pandas DataFrame to Great Expectations Dataset
    ge_df = PandasDataset(df)

    # === SCHEMA VALIDATION - ESSENTIAL COLUMNS ===
    print(" π Validating schema and required columns...")

    # Customer identifier must exist (required for business operations)
    ge_df.expect_column_to_exist("customerID")
    ge_df.expect_column_values_to_not_be_null("customerID")

    # Remaining required columns, registered in a fixed order:
    # demographics, then service features, then financial features.
    required_columns = (
        "gender",
        "Partner",
        "Dependents",
        "PhoneService",
        "InternetService",
        "Contract",
        "tenure",
        "MonthlyCharges",
        "TotalCharges",
    )
    for column in required_columns:
        ge_df.expect_column_to_exist(column)

    # === BUSINESS LOGIC VALIDATION ===
    print(" πΌ Validating business logic constraints...")

    # Categorical columns and the only values each may take.
    # Dict order is the registration order of the expectations.
    allowed_values = {
        "gender": ["Male", "Female"],
        "Partner": ["Yes", "No"],
        "Dependents": ["Yes", "No"],
        "PhoneService": ["Yes", "No"],
        "Contract": ["Month-to-month", "One year", "Two year"],
        "InternetService": ["DSL", "Fiber optic", "No"],
    }
    for column, values in allowed_values.items():
        ge_df.expect_column_values_to_be_in_set(column, values)

    # === NUMERIC RANGE VALIDATION ===
    print(" π Validating numeric ranges and business constraints...")

    # Tenure, monthly charges and total charges are all bounded below at 0.
    ge_df.expect_column_values_to_be_between("tenure", min_value=0)
    ge_df.expect_column_values_to_be_between("MonthlyCharges", min_value=0)
    ge_df.expect_column_values_to_be_between("TotalCharges", min_value=0)

    # === STATISTICAL VALIDATION ===
    print(" π Validating statistical properties...")

    # Tenure should be reasonable (max ~10 years = 120 months for telecom)
    ge_df.expect_column_values_to_be_between("tenure", min_value=0, max_value=120)

    # Monthly charges should be within reasonable business range
    ge_df.expect_column_values_to_be_between("MonthlyCharges", min_value=0, max_value=200)

    # No missing values in critical numeric features
    ge_df.expect_column_values_to_not_be_null("tenure")
    ge_df.expect_column_values_to_not_be_null("MonthlyCharges")

    # === DATA CONSISTENCY CHECKS ===
    print(" π Validating data consistency...")

    # Total charges should generally be >= Monthly charges (except for very
    # new customers). This is a business logic check to catch data entry errors.
    ge_df.expect_column_pair_values_A_to_be_greater_than_B(
        column_A="TotalCharges",
        column_B="MonthlyCharges",
        or_equal=True,
        mostly=0.95,  # Allow 5% exceptions for edge cases
    )

    # === RUN VALIDATION SUITE ===
    print(" βοΈ Running complete validation suite...")
    results = ge_df.validate()

    # === PROCESS RESULTS ===
    check_results = results["results"]

    # Collect the expectation type of every failed check for error reporting.
    failed_expectations = [
        check["expectation_config"]["expectation_type"]
        for check in check_results
        if not check["success"]
    ]

    # Every check either passed or failed, so the counts follow from the list.
    total_checks = len(check_results)
    failed_checks = len(failed_expectations)
    passed_checks = total_checks - failed_checks

    # Print validation summary
    if results["success"]:
        print(f"✅ Data validation PASSED: {passed_checks}/{total_checks} checks successful")
    else:
        print(f"❌ Data validation FAILED: {failed_checks}/{total_checks} checks failed")
        print(f" Failed expectations: {failed_expectations}")

    return results["success"], failed_expectations