Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import joblib | |
| import pandas as pd | |
| import numpy as np | |
# Filesystem layout inside the container image.
APP_DIR = "/app"
ASSETS_DIR = os.path.join(APP_DIR, "model_assets")

# Artifact filenames to probe inside ASSETS_DIR, listed in priority order
# (earlier names win when several are present).
XGB_CANDIDATES = [
    "XGB_spw.joblib",
    "XGBoost_best_5cv.joblib",
    "XGBoost_best.joblib",
    "XGBoost.joblib",
    "xgb_model.joblib",
    "xgb_full.joblib",
]
CAT_CANDIDATES = [
    "CAT_cw.joblib",
    "CatBoost_best_5cv.joblib",
    "CatBoost_best.joblib",
    "CatBoost.joblib",
    "catboost.joblib",
    "cat_model.joblib",
    "cat_full.joblib",
]
def find_first(path_list):
    """Return the path of the first listed filename that exists in ASSETS_DIR.

    Args:
        path_list: Filenames to probe, in priority order.

    Returns:
        The joined path (ASSETS_DIR + filename) of the first existing entry,
        or None when none of them exists.
    """
    candidate_paths = (os.path.join(ASSETS_DIR, filename) for filename in path_list)
    return next(
        (candidate for candidate in candidate_paths if os.path.exists(candidate)),
        None,
    )
def build_sample_input(
    *,
    gender=1,
    height=170,
    weight=70.0,
    ap_hi=120,
    ap_lo=80,
    cholesterol=1,
    gluc=1,
    smoke=0,
    alco=0,
    active=1,
    age_years=50,
    protein_level=14.0,
    ejection_fraction=60.0,
):
    """Build a one-row, all-float feature frame for a smoke-test prediction.

    Called without arguments it reproduces the original hard-coded sample
    (values chosen to be close to the UI defaults). Every raw input can be
    overridden by keyword to exercise other cases.

    Args:
        gender, cholesterol, gluc, smoke, alco, active: Raw coded inputs,
            passed through unchanged. ``smoke``/``alco`` equal to 1 and
            ``active`` equal to 0 each add one point to ``Lifestyle_Score``.
        height: Height; divided by 100 before squaring for the BMI formula.
        weight: Weight used as the BMI numerator.
        ap_hi, ap_lo: Systolic / diastolic blood-pressure readings.
        age_years: Age in years; ``age`` is emitted as ``age_years * 365``.
        protein_level, ejection_fraction: Passed through unchanged.

    Returns:
        A ``pandas.DataFrame`` with a single row (index 0) of floats:
        25 numeric columns followed by 16 one-hot indicator columns named
        ``<Column>_<level>`` for Age_Group, BMI_Category, BP_Category and
        Risk_Level, in that order.

    Raises:
        ValueError: If ``height`` is not positive (BMI would divide by zero).
    """
    if height <= 0:
        raise ValueError("height must be positive to compute BMI")

    # Derived numeric features.
    bmi = weight / ((height / 100) ** 2)
    bp_diff = ap_hi - ap_lo
    map_value = ap_lo + (bp_diff / 3)
    pulse_ratio = bp_diff / ap_hi if ap_hi > 0 else 0
    obesity_flag = int(bmi >= 30)
    hypertension_flag = int(ap_hi >= 140 or ap_lo >= 90)
    lifestyle_score = int(smoke == 1) + int(alco == 1) + int(active == 0)
    health_risk_score = lifestyle_score + obesity_flag + hypertension_flag
    smoker_alcoholic = int(smoke == 1 or alco == 1)
    risk_age = age_years + (health_risk_score * 5)

    # Age bucket derived from age_years instead of a fixed "50-59" label.
    # Ages below 20 match none of the known buckets, so every Age_Group_*
    # indicator stays 0 for them.
    if age_years >= 60:
        age_group = "60+"
    elif age_years >= 20:
        decade = int(age_years // 10) * 10
        age_group = f"{decade}-{decade + 9}"
    else:
        age_group = None

    if bmi < 18.5:
        bmi_category = "Underweight"
    elif bmi < 25:
        bmi_category = "Normal"
    elif bmi < 30:
        bmi_category = "Overweight"
    else:
        bmi_category = "Obese"

    # Stage 1 requires BOTH readings below the 140/90 thresholds, so that
    # "Stage 2" fires whenever either reading reaches them -- the same
    # condition hypertension_flag uses above. (The previous `or` made
    # Stage 2 reachable only when both readings were high.)
    # NOTE(review): confirm the training pipeline used these boundaries.
    if ap_hi < 120 and ap_lo < 80:
        bp_category = "Normal"
    elif ap_hi < 130 and ap_lo < 80:
        bp_category = "Elevated"
    elif ap_hi < 140 and ap_lo < 90:
        bp_category = "Stage 1"
    else:
        bp_category = "Stage 2"

    if health_risk_score <= 2:
        risk_level = "Low"
    elif health_risk_score <= 4:
        risk_level = "Medium"
    else:
        risk_level = "High"

    # Numeric columns, in the order downstream code relies on positionally.
    record = {
        'age': age_years * 365,
        'gender': gender,
        'height': height,
        'weight': weight,
        'ap_hi': ap_hi,
        'ap_lo': ap_lo,
        'cholesterol': cholesterol,
        'gluc': gluc,
        'smoke': smoke,
        'alco': alco,
        'active': active,
        'BMI': bmi,
        'BP_diff': bp_diff,
        'Systolic_Pressure': ap_hi,
        'age_years': age_years,
        'Lifestyle_Score': lifestyle_score,
        'Obesity_Flag': obesity_flag,
        'Hypertension_Flag': hypertension_flag,
        'Health_Risk_Score': health_risk_score,
        'Pulse_Pressure_Ratio': pulse_ratio,
        'MAP': map_value,
        'Smoker_Alcoholic': smoker_alcoholic,
        'Risk_Age': risk_age,
        'Protein_Level': protein_level,
        'Ejection_Fraction': ejection_fraction,
    }

    # One-hot encode the categoricals against fixed level lists so the
    # column set does not depend on which levels the sample happens to hit.
    categoricals = (
        ('Age_Group', age_group, ['20-29', '30-39', '40-49', '50-59', '60+']),
        ('BMI_Category', bmi_category, ['Underweight', 'Normal', 'Overweight', 'Obese']),
        ('BP_Category', bp_category, ['Normal', 'Elevated', 'Stage 1', 'Stage 2']),
        ('Risk_Level', risk_level, ['Low', 'Medium', 'High']),
    )
    for col, value, levels in categoricals:
        for level in levels:
            record[f"{col}_{level}"] = int(value == level)

    return pd.DataFrame([record], columns=list(record)).astype(float)
def align_for_model(model, Xp):
    """Arrange the columns of ``Xp`` to match what ``model`` expects.

    Resolution order:

    1. ``model.feature_names_in_`` -- select/reorder columns by name;
       expected columns missing from ``Xp`` are added filled with 0.0 and
       columns the model does not know are dropped.
    2. ``model.get_booster().feature_names`` -- same by-name alignment.
    3. ``model.n_features_in_`` -- positional fallback: keep the first ``n``
       columns, or append zero-filled ``pad_<i>`` columns up to ``n``.

    If none of these is available, ``Xp`` is returned unchanged.

    Args:
        model: Fitted estimator-like object; only the attributes listed
            above are inspected.
        Xp: ``pandas.DataFrame`` of prepared features.

    Returns:
        A ``pandas.DataFrame`` aligned as described, or ``Xp`` itself when
        no alignment information could be used.

    Notes:
        The booster / feature-count fallbacks are best-effort: any exception
        raised while probing the model is reported as a ``RuntimeWarning``
        (instead of being silently discarded) and ``Xp`` is returned as-is.
    """
    import warnings

    names = getattr(model, 'feature_names_in_', None)
    if names is not None:
        return Xp.reindex(columns=list(names), fill_value=0.0)

    try:
        # xgboost booster feature names
        get_booster = getattr(model, 'get_booster', None)
        booster = get_booster() if callable(get_booster) else None
        booster_names = getattr(booster, 'feature_names', None)
        if booster_names:
            return Xp.reindex(columns=list(booster_names), fill_value=0.0)

        if hasattr(model, 'n_features_in_'):
            n = int(model.n_features_in_)
            missing = n - Xp.shape[1]
            if missing <= 0:
                # Enough (or too many) columns: keep the first n by position.
                return Xp.iloc[:, :n].copy()
            # Too few columns: pad with zero columns.
            pad = pd.DataFrame(
                0.0,
                index=Xp.index,
                columns=[f"pad_{i}" for i in range(missing)],
            )
            return pd.concat([Xp, pad], axis=1)
    except Exception as exc:  # best-effort probing; never fail the caller
        warnings.warn(
            f"align_for_model: could not align features ({exc!r}); "
            "using input columns unchanged",
            RuntimeWarning,
            stacklevel=2,
        )
    return Xp
def _positive_class_score(model, features):
    """Return the model's score for the first row of ``features`` as a float.

    Uses column 1 of ``predict_proba`` when the model offers it, otherwise
    the first element of ``predict``.
    """
    if hasattr(model, 'predict_proba'):
        return float(model.predict_proba(features)[0, 1])
    return float(model.predict(features)[0])


def main():
    """Load both model artifacts, score one sample, and print the result as JSON.

    Steps: locate the XGBoost and CatBoost artifacts in ASSETS_DIR, load them
    with joblib, build the sample feature row, shape it for each model, and
    print the two individual scores plus their equal-weight average.

    Raises:
        FileNotFoundError: If no candidate artifact exists for either model.
            (Raised explicitly rather than via ``assert`` so the check still
            runs when Python is started with ``-O``.)
    """
    xgb_path = find_first(XGB_CANDIDATES)
    cat_path = find_first(CAT_CANDIDATES)
    if not xgb_path:
        raise FileNotFoundError(f"XGBoost artifact not found in {ASSETS_DIR}")
    if not cat_path:
        raise FileNotFoundError(f"CatBoost artifact not found in {ASSETS_DIR}")

    # NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
    # ship with the image, never files from an untrusted source.
    xgb = joblib.load(xgb_path)
    cat = joblib.load(cat_path)

    Xp = build_sample_input()

    # Force shape match for XGBoost using n_features_in_: keep the first
    # n_xgb columns by position and pass a plain array (no column names).
    n_xgb = int(getattr(xgb, 'n_features_in_', Xp.shape[1]))
    X_xgb = Xp.iloc[:, :n_xgb].values
    print(f"DBG: n_xgb={n_xgb}, Xp.shape={Xp.shape}, X_xgb.shape={X_xgb.shape}")

    # Align for CatBoost by names if available; otherwise pass the full
    # matrix, since the model exposes no names to align against.
    if hasattr(cat, 'feature_names_in_'):
        X_cat = align_for_model(cat, Xp)
    else:
        X_cat = Xp.values
    print(f"DBG: X_cat.shape={X_cat.shape}")

    px = _positive_class_score(xgb, X_xgb)
    pc = _positive_class_score(cat, X_cat)
    pe = 0.5 * px + 0.5 * pc  # equal-weight ensemble of the two scores

    out = {
        'xgb_prob': px,
        'cat_prob': pc,
        'ensemble_prob': pe,
        'ensemble_risk_percent': pe * 100.0,
    }
    print(json.dumps(out, indent=2))
# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()