File size: 1,705 Bytes
76fbb44 dc5f100 76fbb44 dc5f100 76fbb44 dc5f100 76fbb44 dc5f100 76fbb44 dc5f100 76fbb44 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import pandas as pd
class InputValidationError(Exception):
    """Exception type signalling that input validation failed."""
def validate_and_prepare_input(input_df: pd.DataFrame, model) -> pd.DataFrame:
    """
    Validate an input dataframe against the features the model expects.

    The expected feature names are read from
    ``model.get_booster().feature_names``. The input must contain exactly
    those columns (none missing, none extra, none duplicated) and each of
    them must have a numeric dtype.

    Args:
        input_df: Candidate input for prediction.
        model: Object exposing ``get_booster()`` whose result carries a
            ``feature_names`` attribute (a trained XGBoost model).

    Returns:
        A dataframe with the input's columns arranged in the model's
        feature order. The caller's dataframe is not reordered in place.

    Raises:
        InputValidationError: If ``input_df`` is not a DataFrame, the
            feature names cannot be obtained from the model, a required
            column is missing, an unexpected or duplicated column is
            present, or an expected column is not numeric.
    """
    if not isinstance(input_df, pd.DataFrame):
        raise InputValidationError("Input must be a pandas DataFrame.")

    # Get expected feature names from trained XGBoost model
    try:
        feature_names = model.get_booster().feature_names
    except Exception as err:
        # Keep the root cause attached so callers can see why lookup failed.
        raise InputValidationError(
            "Unable to retrieve model feature names."
        ) from err

    # The attribute may be None (a booster without stored names); iterating
    # it below would otherwise escape as a bare TypeError.
    if feature_names is None:
        raise InputValidationError("Unable to retrieve model feature names.")
    expected_features = list(feature_names)

    # -------------------------
    # 1 Check missing columns
    # -------------------------
    # Reported in model feature order so the message is deterministic.
    provided = set(input_df.columns)
    missing_cols = [col for col in expected_features if col not in provided]
    if missing_cols:
        raise InputValidationError(
            f"Missing required columns: {missing_cols}"
        )

    # -------------------------
    # 2 Check extra columns
    # -------------------------
    # Reported in input column order, each name once.
    expected = set(expected_features)
    extra_cols = list(
        dict.fromkeys(col for col in input_df.columns if col not in expected)
    )
    if extra_cols:
        raise InputValidationError(
            f"Unexpected columns provided: {extra_cols}"
        )

    # -------------------------
    # 3 Reject duplicated columns
    # -------------------------
    # A repeated label makes input_df[col] a DataFrame, which would fail the
    # numeric check below with a misleading message.
    if not input_df.columns.is_unique:
        repeated = input_df.columns[input_df.columns.duplicated()]
        raise InputValidationError(
            f"Duplicate columns provided: {list(dict.fromkeys(repeated))}"
        )

    # -------------------------
    # 4 Enforce numeric types
    # -------------------------
    for col in expected_features:
        if not pd.api.types.is_numeric_dtype(input_df[col]):
            raise InputValidationError(
                f"Column '{col}' must be numeric."
            )

    # -------------------------
    # 5 Reorder columns safely
    # -------------------------
    return input_df[expected_features]
|