harishsohani committed on
Commit
dc5f100
·
verified ·
1 Parent(s): 921bf26

Create utils/validation.py

Browse files
Files changed (1) hide show
  1. utils/validation.py +56 -0
utils/validation.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
class InputValidationError(Exception):
    """Custom exception raised when input data fails validation.

    Carries a human-readable message describing which check failed, so
    callers can catch this single type and surface the message directly.
    """
7
+
8
+
9
def validate_and_prepare_input(input_df: pd.DataFrame, model):
    """Validate an input frame against the model's features and reorder it.

    Args:
        input_df: Data to be passed to the model for prediction.
        model: Object exposing ``get_booster().feature_names`` (e.g. a
            fitted XGBoost scikit-learn wrapper).

    Returns:
        A DataFrame holding exactly the model's feature columns, in the
        order the model reports them.

    Raises:
        InputValidationError: If ``input_df`` is not a DataFrame, the
            feature names cannot be obtained from ``model``, required
            columns are missing, unexpected or duplicate columns are
            present, or any feature column is not numeric.
    """
    if not isinstance(input_df, pd.DataFrame):
        raise InputValidationError("Input must be a pandas DataFrame.")

    # Get expected feature names from the trained XGBoost model.
    try:
        expected_features = model.get_booster().feature_names
    except Exception as err:
        # Keep the original failure attached as __cause__ for debugging.
        raise InputValidationError(
            "Unable to retrieve model feature names."
        ) from err

    # The attribute can legitimately be None (no names recorded at training
    # time); without names there is nothing to validate against.
    if expected_features is None:
        raise InputValidationError("Unable to retrieve model feature names.")
    expected_features = list(expected_features)

    provided = set(input_df.columns)
    expected = set(expected_features)

    # -------------------------
    # 1. Check missing columns
    # -------------------------
    # Reported in model order so the message is deterministic.
    missing_cols = [col for col in expected_features if col not in provided]
    if missing_cols:
        raise InputValidationError(
            f"Missing required columns: {missing_cols}"
        )

    # -------------------------
    # 2. Check extra columns
    # -------------------------
    # Reported in input order, de-duplicated, so the message is deterministic.
    extra_cols = list(
        dict.fromkeys(col for col in input_df.columns if col not in expected)
    )
    if extra_cols:
        raise InputValidationError(
            f"Unexpected columns provided: {extra_cols}"
        )

    # -------------------------
    # 3. Reject duplicate columns
    # -------------------------
    # A repeated label makes input_df[col] a DataFrame rather than a Series,
    # which would otherwise surface as a misleading "must be numeric" error.
    duplicate_cols = list(
        input_df.columns[input_df.columns.duplicated()].unique()
    )
    if duplicate_cols:
        raise InputValidationError(
            f"Duplicate columns provided: {duplicate_cols}"
        )

    # -------------------------
    # 4. Enforce numeric types
    # -------------------------
    for col in expected_features:
        if not pd.api.types.is_numeric_dtype(input_df[col]):
            raise InputValidationError(
                f"Column '{col}' must be numeric."
            )

    # -------------------------
    # 5. Reorder columns safely
    # -------------------------
    return input_df[expected_features]