Spaces:

logan-codes
/

telco-churn-predictor

Sleeping

App Files Files Community

telco-churn-predictor / src /utils /validate_data.py

logan-codes

Add Dockerfile, Gradio app, and core src modules

4ba360f about 1 month ago

raw

history blame contribute delete

5.18 kB

	import great_expectations as ge
	from great_expectations.dataset import PandasDataset
	from typing import Tuple, List


	def validate_telco_data(df) -> Tuple[bool, List[str]]:
	    """
	    Validate a Telco Customer Churn dataset with Great Expectations.

	    Runs the data quality checks that must pass before model training:
	    required columns, allowed categorical values, numeric ranges, null
	    checks on critical features and a TotalCharges/MonthlyCharges
	    consistency check.

	    Args:
	        df: pandas DataFrame holding the Telco churn data.

	    Returns:
	        A ``(success, failed_expectations)`` tuple. ``success`` is True only
	        when every expectation passed. ``failed_expectations`` lists the
	        expectation type name of each failed check (one entry per failure).
	        When required columns are missing, the function returns early with
	        one ``"expect_column_to_exist"`` entry per missing column instead of
	        letting the column-level checks raise on the absent column.
	    """
	    print("🔍 Starting data validation with Great Expectations...")

	    # Customer identifier (required for business operations).
	    id_column = "customerID"
	    # Demographic, service and financial features, in registration order.
	    feature_columns = (
	        "gender",
	        "Partner",
	        "Dependents",
	        "PhoneService",
	        "InternetService",
	        "Contract",
	        "tenure",
	        "MonthlyCharges",
	        "TotalCharges",
	    )

	    # Allowed values per categorical column (dicts keep insertion order, so
	    # the expectations are registered in the order written here).
	    allowed_values = {
	        "gender": ["Male", "Female"],
	        "Partner": ["Yes", "No"],
	        "Dependents": ["Yes", "No"],
	        "PhoneService": ["Yes", "No"],
	        "Contract": ["Month-to-month", "One year", "Two year"],
	        "InternetService": ["DSL", "Fiber optic", "No"],
	    }

	    # One range expectation per numeric column. The lower bound of 0 and the
	    # "reasonable" upper bound are declared together instead of registering
	    # the same expectation type twice on the same column.
	    numeric_ranges = {
	        # Tenure is non-negative and capped at ~10 years (120 months).
	        "tenure": {"min_value": 0, "max_value": 120},
	        # Monthly charges are non-negative and within a sane business range.
	        "MonthlyCharges": {"min_value": 0, "max_value": 200},
	        # Total charges only need to be non-negative.
	        "TotalCharges": {"min_value": 0},
	    }

	    # === SCHEMA VALIDATION - ESSENTIAL COLUMNS ===
	    print(" 📋 Validating schema and required columns...")

	    # Column-level expectations are evaluated as soon as they are declared
	    # and fail hard on an absent column, so report a schema failure up front.
	    # Inputs without a ``columns`` attribute skip this guard and are handed
	    # to Great Expectations unchanged.
	    present_columns = getattr(df, "columns", None)
	    if present_columns is not None:
	        missing_columns = [
	            column
	            for column in (id_column, *feature_columns)
	            if column not in present_columns
	        ]
	        if missing_columns:
	            print(
	                "❌ Data validation FAILED: "
	                f"missing required columns: {missing_columns}"
	            )
	            return False, ["expect_column_to_exist"] * len(missing_columns)

	    # Convert pandas DataFrame to Great Expectations Dataset
	    ge_df = PandasDataset(df)

	    ge_df.expect_column_to_exist(id_column)
	    ge_df.expect_column_values_to_not_be_null(id_column)
	    for column in feature_columns:
	        ge_df.expect_column_to_exist(column)

	    # === BUSINESS LOGIC VALIDATION ===
	    print(" 💼 Validating business logic constraints...")
	    for column, value_set in allowed_values.items():
	        ge_df.expect_column_values_to_be_in_set(column, value_set)

	    # === NUMERIC RANGE VALIDATION ===
	    print(" 📊 Validating numeric ranges and business constraints...")
	    for column, bounds in numeric_ranges.items():
	        ge_df.expect_column_values_to_be_between(column, **bounds)

	    # === STATISTICAL VALIDATION ===
	    print(" 📈 Validating statistical properties...")

	    # No missing values in critical numeric features
	    ge_df.expect_column_values_to_not_be_null("tenure")
	    ge_df.expect_column_values_to_not_be_null("MonthlyCharges")

	    # === DATA CONSISTENCY CHECKS ===
	    print(" 🔗 Validating data consistency...")

	    # Total charges should generally be >= Monthly charges (except for very
	    # new customers); catches data entry errors.
	    ge_df.expect_column_pair_values_A_to_be_greater_than_B(
	        column_A="TotalCharges",
	        column_B="MonthlyCharges",
	        or_equal=True,
	        mostly=0.95,  # Allow 5% exceptions for edge cases
	    )

	    # === RUN VALIDATION SUITE ===
	    print(" ⚙️ Running complete validation suite...")
	    results = ge_df.validate()

	    # === PROCESS RESULTS ===
	    # Collect the expectation type of every failed check for error reporting.
	    failed_expectations = [
	        r["expectation_config"]["expectation_type"]
	        for r in results["results"]
	        if not r["success"]
	    ]

	    total_checks = len(results["results"])
	    failed_checks = len(failed_expectations)
	    passed_checks = total_checks - failed_checks

	    if results["success"]:
	        print(f"✅ Data validation PASSED: {passed_checks}/{total_checks} checks successful")
	    else:
	        print(f"❌ Data validation FAILED: {failed_checks}/{total_checks} checks failed")
	        print(f" Failed expectations: {failed_expectations}")

	    return results["success"], failed_expectations