Spaces:
Sleeping
Sleeping
| import great_expectations as ge | |
| from great_expectations.dataset import PandasDataset | |
| from typing import Tuple, List | |
| def validate_telco_data(df) -> Tuple[bool, List[str]]: | |
| """ | |
| Comprehensive data validation for Telco Customer Churn dataset using Great Expectations. | |
| This function implements critical data quality checks that must pass before model training. | |
| It validates data integrity, business logic constraints, and statistical properties | |
| that the ML model expects. | |
| """ | |
| print("π Starting data validation with Great Expectations...") | |
| # Convert pandas DataFrame to Great Expectations Dataset | |
| ge_df = PandasDataset(df) | |
| # === SCHEMA VALIDATION - ESSENTIAL COLUMNS === | |
| print(" π Validating schema and required columns...") | |
| # Customer identifier must exist (required for business operations) | |
| ge_df.expect_column_to_exist("customerID") | |
| ge_df.expect_column_values_to_not_be_null("customerID") | |
| # Core demographic features | |
| ge_df.expect_column_to_exist("gender") | |
| ge_df.expect_column_to_exist("Partner") | |
| ge_df.expect_column_to_exist("Dependents") | |
| # Service features (critical for churn analysis) | |
| ge_df.expect_column_to_exist("PhoneService") | |
| ge_df.expect_column_to_exist("InternetService") | |
| ge_df.expect_column_to_exist("Contract") | |
| # Financial features (key churn predictors) | |
| ge_df.expect_column_to_exist("tenure") | |
| ge_df.expect_column_to_exist("MonthlyCharges") | |
| ge_df.expect_column_to_exist("TotalCharges") | |
| # === BUSINESS LOGIC VALIDATION === | |
| print(" πΌ Validating business logic constraints...") | |
| # Gender must be one of expected values (data integrity) | |
| ge_df.expect_column_values_to_be_in_set("gender", ["Male", "Female"]) | |
| # Yes/No fields must have valid values | |
| ge_df.expect_column_values_to_be_in_set("Partner", ["Yes", "No"]) | |
| ge_df.expect_column_values_to_be_in_set("Dependents", ["Yes", "No"]) | |
| ge_df.expect_column_values_to_be_in_set("PhoneService", ["Yes", "No"]) | |
| # Contract types must be valid (business constraint) | |
| ge_df.expect_column_values_to_be_in_set( | |
| "Contract", | |
| ["Month-to-month", "One year", "Two year"] | |
| ) | |
| # Internet service types (business constraint) | |
| ge_df.expect_column_values_to_be_in_set( | |
| "InternetService", | |
| ["DSL", "Fiber optic", "No"] | |
| ) | |
| # === NUMERIC RANGE VALIDATION === | |
| print(" π Validating numeric ranges and business constraints...") | |
| # Tenure must be non-negative (business logic - can't have negative tenure) | |
| ge_df.expect_column_values_to_be_between("tenure", min_value=0) | |
| # Monthly charges must be positive (business logic - no free service) | |
| ge_df.expect_column_values_to_be_between("MonthlyCharges", min_value=0) | |
| # Total charges should be non-negative (business logic) | |
| ge_df.expect_column_values_to_be_between("TotalCharges", min_value=0) | |
| # === STATISTICAL VALIDATION === | |
| print(" π Validating statistical properties...") | |
| # Tenure should be reasonable (max ~10 years = 120 months for telecom) | |
| ge_df.expect_column_values_to_be_between("tenure", min_value=0, max_value=120) | |
| # Monthly charges should be within reasonable business range | |
| ge_df.expect_column_values_to_be_between("MonthlyCharges", min_value=0, max_value=200) | |
| # No missing values in critical numeric features | |
| ge_df.expect_column_values_to_not_be_null("tenure") | |
| ge_df.expect_column_values_to_not_be_null("MonthlyCharges") | |
| # === DATA CONSISTENCY CHECKS === | |
| print(" π Validating data consistency...") | |
| # Total charges should generally be >= Monthly charges (except for very new customers) | |
| # This is a business logic check to catch data entry errors | |
| ge_df.expect_column_pair_values_A_to_be_greater_than_B( | |
| column_A="TotalCharges", | |
| column_B="MonthlyCharges", | |
| or_equal=True, | |
| mostly=0.95 # Allow 5% exceptions for edge cases | |
| ) | |
| # === RUN VALIDATION SUITE === | |
| print(" βοΈ Running complete validation suite...") | |
| results = ge_df.validate() | |
| # === PROCESS RESULTS === | |
| # Extract failed expectations for detailed error reporting | |
| failed_expectations = [] | |
| for r in results["results"]: | |
| if not r["success"]: | |
| expectation_type = r["expectation_config"]["expectation_type"] | |
| failed_expectations.append(expectation_type) | |
| # Print validation summary | |
| total_checks = len(results["results"]) | |
| passed_checks = sum(1 for r in results["results"] if r["success"]) | |
| failed_checks = total_checks - passed_checks | |
| if results["success"]: | |
| print(f"β Data validation PASSED: {passed_checks}/{total_checks} checks successful") | |
| else: | |
| print(f"β Data validation FAILED: {failed_checks}/{total_checks} checks failed") | |
| print(f" Failed expectations: {failed_expectations}") | |
| return results["success"], failed_expectations |