"""Smoke test for the packaged cardiovascular-risk model artifacts.

Locates the XGBoost and CatBoost artifacts under ``ASSETS_DIR``, builds one
hand-written sample row with the same derived and one-hot encoded features the
script has always produced, scores it with both models and prints the
individual and averaged probabilities as JSON.
"""
import json
import os
import warnings

import numpy as np
import pandas as pd

# Paths inside the container image
APP_DIR = "/app"
ASSETS_DIR = os.path.join(APP_DIR, "model_assets")

# Resolve model paths with fallbacks: the first file name that exists wins.
XGB_CANDIDATES = [
    "XGB_spw.joblib",
    "XGBoost_best_5cv.joblib",
    "XGBoost_best.joblib",
    "XGBoost.joblib",
    "xgb_model.joblib",
    "xgb_full.joblib",
]
CAT_CANDIDATES = [
    "CAT_cw.joblib",
    "CatBoost_best_5cv.joblib",
    "CatBoost_best.joblib",
    "CatBoost.joblib",
    "catboost.joblib",
    "cat_model.joblib",
    "cat_full.joblib",
]


def find_first(path_list, base_dir=None):
    """Return the path of the first candidate file that exists.

    Args:
        path_list: File names to try, in priority order.
        base_dir: Directory the names are resolved against. Defaults to
            ``ASSETS_DIR`` (looked up at call time).

    Returns:
        The joined path of the first existing candidate, or ``None`` when no
        candidate exists.
    """
    directory = ASSETS_DIR if base_dir is None else base_dir
    for name in path_list:
        candidate = os.path.join(directory, name)
        if os.path.exists(candidate):
            return candidate
    return None


def classify_bmi(bmi):
    """Return the BMI category label for a body-mass index value."""
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal"
    if bmi < 30:
        return "Overweight"
    return "Obese"


def classify_blood_pressure(ap_hi, ap_lo):
    """Return the blood-pressure category for a systolic/diastolic pair.

    "Stage 2" is reached as soon as *either* reading crosses its threshold
    (``ap_hi >= 140`` or ``ap_lo >= 90``), which matches the hypertension flag
    computed in ``build_sample_input``. The previous ``or`` in the "Stage 1"
    test required *both* readings to be high before "Stage 2" was reported.
    """
    if ap_hi < 120 and ap_lo < 80:
        return "Normal"
    if ap_hi < 130 and ap_lo < 80:
        return "Elevated"
    if ap_hi < 140 and ap_lo < 90:
        return "Stage 1"
    return "Stage 2"


def build_sample_input():
    """Build the single-row, fully numeric feature frame used for scoring.

    Returns:
        A one-row ``DataFrame`` of floats: the numeric features in
        ``feature_cols`` order followed by the one-hot columns
        (``"<column>_<level>"``) for each categorical feature. Column order
        matters because ``main`` slices the frame positionally for XGBoost.
    """
    # Use values close to the UI defaults
    gender = 1
    height = 170
    weight = 70.0
    ap_hi = 120
    ap_lo = 80
    cholesterol = 1
    gluc = 1
    smoke = 0
    alco = 0
    active = 1
    age_years = 50
    age_days = age_years * 365

    # Derived features
    bmi = weight / ((height / 100) ** 2)
    bp_diff = ap_hi - ap_lo
    systolic_pressure = ap_hi
    map_value = ap_lo + (bp_diff / 3)
    pulse_ratio = bp_diff / ap_hi if ap_hi > 0 else 0
    obesity_flag = 1 if bmi >= 30 else 0
    hypertension_flag = 1 if (ap_hi >= 140 or ap_lo >= 90) else 0
    lifestyle_score = (
        (1 if smoke == 1 else 0)
        + (1 if alco == 1 else 0)
        + (1 if active == 0 else 0)
    )
    health_risk_score = lifestyle_score + obesity_flag + hypertension_flag
    smoker_alcoholic = 1 if (smoke == 1 or alco == 1) else 0
    age_group = "50-59"  # hard-coded to match age_years above
    bmi_category = classify_bmi(bmi)
    bp_category = classify_blood_pressure(ap_hi, ap_lo)
    if health_risk_score <= 2:
        risk_level = "Low"
    elif health_risk_score <= 4:
        risk_level = "Medium"
    else:
        risk_level = "High"
    risk_age = age_years + (health_risk_score * 5)
    protein_level = 14.0
    ejection_fraction = 60.0

    feature_cols = [
        'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
        'gluc', 'smoke', 'alco', 'active', 'BMI', 'BP_diff',
        'Systolic_Pressure', 'age_years', 'Age_Group', 'Lifestyle_Score',
        'Obesity_Flag', 'Hypertension_Flag', 'Health_Risk_Score',
        'Pulse_Pressure_Ratio', 'MAP', 'BMI_Category', 'Smoker_Alcoholic',
        'BP_Category', 'Risk_Age', 'Risk_Level', 'Protein_Level',
        'Ejection_Fraction',
    ]
    row = {
        'age': age_days,
        'gender': gender,
        'height': height,
        'weight': weight,
        'ap_hi': ap_hi,
        'ap_lo': ap_lo,
        'cholesterol': cholesterol,
        'gluc': gluc,
        'smoke': smoke,
        'alco': alco,
        'active': active,
        'BMI': bmi,
        'BP_diff': bp_diff,
        'Systolic_Pressure': systolic_pressure,
        'age_years': age_years,
        'Age_Group': age_group,
        'Lifestyle_Score': lifestyle_score,
        'Obesity_Flag': obesity_flag,
        'Hypertension_Flag': hypertension_flag,
        'Health_Risk_Score': health_risk_score,
        'Pulse_Pressure_Ratio': pulse_ratio,
        'MAP': map_value,
        'BMI_Category': bmi_category,
        'Smoker_Alcoholic': smoker_alcoholic,
        'BP_Category': bp_category,
        'Risk_Age': risk_age,
        'Risk_Level': risk_level,
        'Protein_Level': protein_level,
        'Ejection_Fraction': ejection_fraction,
    }
    X = pd.DataFrame([row])[feature_cols]

    # One-hot encode categoricals using the same fallback values as app
    cat_cols = ['Age_Group', 'BMI_Category', 'BP_Category', 'Risk_Level']
    cat_values = {
        'Age_Group': ['20-29', '30-39', '40-49', '50-59', '60+'],
        'BMI_Category': ['Underweight', 'Normal', 'Overweight', 'Obese'],
        'BP_Category': ['Normal', 'Elevated', 'Stage 1', 'Stage 2'],
        'Risk_Level': ['Low', 'Medium', 'High'],
    }
    numeric_cols = [c for c in X.columns if c not in cat_cols]
    numeric_part = X[numeric_cols].copy()

    # Dict insertion order fixes the one-hot column order:
    # categorical column first, then its levels as listed in cat_values.
    one_hot = {}
    for col in cat_cols:
        actual = X[col].iloc[0]
        for level in cat_values[col]:
            one_hot[f"{col}_{level}"] = [1 if actual == level else 0]
    encoded_part = pd.DataFrame(one_hot, index=X.index)

    return pd.concat([numeric_part, encoded_part], axis=1).astype(float)


def _booster_feature_names(model):
    """Return the feature names of ``model.get_booster()``, or ``None``.

    Best effort: a model without ``get_booster``, a booster without names, or
    a ``get_booster`` call that raises all yield ``None``. A raised error is
    reported as a warning instead of being silently discarded.
    """
    get_booster = getattr(model, 'get_booster', None)
    if get_booster is None:
        return None
    try:
        booster = get_booster()
    except Exception as exc:  # best-effort probe of an arbitrary model object
        warnings.warn(
            f"Could not read booster feature names ({exc!r}); "
            "falling back to positional alignment.",
            RuntimeWarning,
            stacklevel=3,
        )
        return None
    names = getattr(booster, 'feature_names', None)
    if names is None:
        return None
    return list(names) or None


def align_for_model(model, Xp):
    """Align the columns of ``Xp`` to what ``model`` expects.

    Resolution order:
      1. ``model.feature_names_in_`` - select/reorder by name.
      2. ``model.get_booster().feature_names`` - select/reorder by name.
      3. ``model.n_features_in_`` - keep the first *n* columns, or append
         zero-filled ``pad_<i>`` columns when ``Xp`` is too narrow.
      4. Otherwise ``Xp`` is returned unchanged.

    When aligning by name, expected columns missing from ``Xp`` are filled
    with ``0.0`` and columns of ``Xp`` the model does not know are dropped.

    Args:
        model: Any fitted estimator-like object.
        Xp: Feature ``DataFrame`` (expected to have unique column names).

    Returns:
        The aligned ``DataFrame``.
    """
    expected = getattr(model, 'feature_names_in_', None)
    expected = list(expected) if expected is not None else _booster_feature_names(model)
    if expected is not None:
        return Xp.reindex(columns=expected, fill_value=0.0)

    n_expected = getattr(model, 'n_features_in_', None)
    if n_expected is None:
        return Xp

    # Fallback: trim or pad to match expected number of features
    n_expected = int(n_expected)
    n_have = Xp.shape[1]
    if n_have >= n_expected:
        return Xp.iloc[:, :n_expected].copy()
    pad = pd.DataFrame(
        0.0,
        index=Xp.index,
        columns=[f"pad_{i}" for i in range(n_expected - n_have)],
    )
    return pd.concat([Xp, pad], axis=1)


def _positive_probability(model, features):
    """Score the first row: positive-class probability, else raw prediction."""
    if hasattr(model, 'predict_proba'):
        return float(model.predict_proba(features)[0, 1])
    return float(model.predict(features)[0])


def main():
    """Load both artifacts, score the sample row and print the result as JSON.

    Raises:
        FileNotFoundError: If no XGBoost or no CatBoost artifact exists in
            ``ASSETS_DIR``. (Raised explicitly rather than via ``assert`` so
            the check survives ``python -O``.)
    """
    xgb_path = find_first(XGB_CANDIDATES)
    cat_path = find_first(CAT_CANDIDATES)
    if xgb_path is None:
        raise FileNotFoundError(f"XGBoost artifact not found in {ASSETS_DIR}")
    if cat_path is None:
        raise FileNotFoundError(f"CatBoost artifact not found in {ASSETS_DIR}")

    # Imported here so the feature helpers above stay importable in
    # environments that do not ship joblib.
    import joblib

    # NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
    # were baked into the image from a trusted build.
    xgb = joblib.load(xgb_path)
    cat = joblib.load(cat_path)

    Xp = build_sample_input()

    # Force shape match for XGBoost using n_features_in_
    n_xgb = int(getattr(xgb, 'n_features_in_', Xp.shape[1]))
    X_xgb = Xp.iloc[:, :n_xgb].values
    print(f"DBG: n_xgb={n_xgb}, Xp.shape={Xp.shape}, X_xgb.shape={X_xgb.shape}")

    # Align for CatBoost (by names if available), otherwise force shape
    if hasattr(cat, 'feature_names_in_'):
        X_cat = align_for_model(cat, Xp)
    else:
        # CatBoost models often don't expose names; pass full matrix
        X_cat = Xp.values
    print(f"DBG: X_cat.shape={X_cat.shape}")

    px = _positive_probability(xgb, X_xgb)
    pc = _positive_probability(cat, X_cat)

    # Equal-weight average of the two model probabilities.
    pe = 0.5 * px + 0.5 * pc
    out = {
        'xgb_prob': px,
        'cat_prob': pc,
        'ensemble_prob': pe,
        'ensemble_risk_percent': pe * 100.0,
    }
    print(json.dumps(out, indent=2))


if __name__ == "__main__":
    main()