"""Gradio app exposing two flight-delay predictors.

* Classification tab: a tuned ensemble classifier plus decision threshold,
  loaded from ``main/final_delay_model.pkl``.
* Regression tab: three regressors (Ridge, XGBoost, Gradient Boosting) that
  are only consulted when the caller reports ``DEP_DEL15 == 1``.
"""
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# -----------------------------
# Load new tuned classification model package
# -----------------------------
# This file should be created from your training script:
# joblib.dump({"model": ensemble, "threshold": best_threshold, "columns": list(X_train.columns)}, "main/final_delay_model.pkl")
# NOTE: joblib.load unpickles arbitrary objects; only load trusted artifacts.
model_package = joblib.load("main/final_delay_model.pkl")
ensemble_model = model_package["model"]
best_threshold = model_package["threshold"]
reference_columns = model_package["columns"]

# -----------------------------
# Load regression models and training columns
# -----------------------------
ridge_model = joblib.load("main/ridge_model.pkl")
xgb_reg_model = joblib.load("main/xgb_model.pkl")
gbr_reg_model = joblib.load("main/gbr_model.pkl")
training_columns_reg = joblib.load("main/training_columns.pkl")

# Columns that are one-hot encoded before classification.
_CLASSIFICATION_CATEGORICAL_COLS = [
    'UNIQUE_CARRIER', 'CARRIER', 'ORIGIN', 'DEST',
    'ORIGIN_STATE_ABR', 'DEST_STATE_ABR',
    'DEP_TIME_BLK', 'ARR_TIME_BLK',
]

# Columns that are one-hot encoded before regression.
_REGRESSION_CATEGORICAL_COLS = ['time_of_day', 'wind_dir_bucket']


def _clean_text(value):
    """Strip surrounding whitespace from text input; pass other values through.

    A stray space in a free-text field would otherwise produce a one-hot
    column name that matches none of the training columns.
    """
    return value.strip() if isinstance(value, str) else value


def _first_prediction(model, X) -> float:
    """Return the first prediction of ``model`` for ``X`` as a built-in float.

    ``np.ravel`` flattens the result so both 1-D and 2-D prediction arrays
    are handled; the ``float`` cast keeps NumPy scalars out of the JSON output.
    """
    return float(np.ravel(model.predict(X))[0])


# -----------------------------
# Preprocessing for classification
# -----------------------------
def preprocess_classification(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode ``df`` and align it with the classifier's training columns.

    Args:
        df: Raw input rows containing the categorical columns listed in
            ``_CLASSIFICATION_CATEGORICAL_COLS`` plus the numeric features.

    Returns:
        A float DataFrame whose columns are exactly ``reference_columns``,
        in that order.
    """
    df_encoded = pd.get_dummies(df, columns=_CLASSIFICATION_CATEGORICAL_COLS)

    # Align with the training layout in one step: columns not seen during
    # training are dropped, missing ones are added filled with 0.
    df_encoded = df_encoded.reindex(columns=reference_columns, fill_value=0)

    # Impute missing values.
    # NOTE(review): the imputer is fitted on the request itself, so the
    # "median" is just each column's own value in this input; in practice it
    # mainly converts the frame to a float array. Persisting the training-time
    # imputer in the model package would be needed for real imputation.
    imputer = SimpleImputer(strategy='median')
    df_encoded = pd.DataFrame(
        imputer.fit_transform(df_encoded),
        columns=df_encoded.columns,
    )
    return df_encoded


# -----------------------------
# Preprocessing for regression
# -----------------------------
def preprocess_regression(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode ``df`` and align it with ``training_columns_reg``.

    Columns absent from the input are added filled with 0; columns not in
    ``training_columns_reg`` are dropped.
    """
    df_encoded = pd.get_dummies(df, columns=_REGRESSION_CATEGORICAL_COLS)
    return df_encoded.reindex(columns=training_columns_reg, fill_value=0)


# -----------------------------
# Delay category helper
# -----------------------------
def categorize_delay(minutes: float) -> str:
    """Map a delay in minutes to a human-readable category.

    Buckets: ``< 15``, ``[15, 20)``, ``[20, 30)``, ``[30, 60)`` and
    everything else (60 and above).
    """
    # Each branch is only reached when the previous upper bound failed, so
    # explicit lower bounds are unnecessary.
    if minutes < 15:
        return "Delay not considered less than 15 mins"
    if minutes < 20:
        return "Delay is Minimum"
    if minutes < 30:
        return "Flight is moderately delayed"
    if minutes < 60:
        return "Flight is highly delayed"
    return "Flight is delayed too much"


# -----------------------------
# Classification prediction function
# -----------------------------
def predict_classification(YEAR, MONTH, DAY_OF_MONTH, DAY_OF_WEEK,
                           ORIGIN, DEST, CARRIER,
                           ORIGIN_STATE_ABR, DEST_STATE_ABR,
                           DEP_TIME_BLK, ARR_TIME_BLK,
                           temp, prcp, wspd, wdir,
                           route_delay_rate) -> dict:
    """Classify a flight as "Delayed" or "On Time" with the tuned ensemble.

    The positive-class probability from ``ensemble_model.predict_proba`` is
    compared against ``best_threshold``.

    Returns:
        A dict with keys ``"Prediction"``, ``"Confidence"`` (positive-class
        probability, rounded to 3 decimals) and ``"Threshold"``.

    Raises:
        TypeError, ValueError: If a numeric field is empty or not convertible
            by ``int``/``float``.
    """
    carrier = _clean_text(CARRIER)
    data = {
        'YEAR': int(YEAR),
        'MONTH': int(MONTH),
        'DAY_OF_MONTH': int(DAY_OF_MONTH),
        'DAY_OF_WEEK': int(DAY_OF_WEEK),
        # The single carrier input feeds both carrier columns.
        'UNIQUE_CARRIER': carrier,
        'CARRIER': carrier,
        'ORIGIN': _clean_text(ORIGIN),
        'DEST': _clean_text(DEST),
        'ORIGIN_STATE_ABR': _clean_text(ORIGIN_STATE_ABR),
        'DEST_STATE_ABR': _clean_text(DEST_STATE_ABR),
        'DEP_TIME_BLK': _clean_text(DEP_TIME_BLK),
        'ARR_TIME_BLK': _clean_text(ARR_TIME_BLK),
        'temp': float(temp),
        'prcp': float(prcp),
        'wspd': float(wspd),
        'wdir': float(wdir),
        'route_delay_rate': float(route_delay_rate),
    }

    X = preprocess_classification(pd.DataFrame([data]))

    # Cast to built-in float so the JSON output never carries NumPy scalars.
    proba = float(ensemble_model.predict_proba(X)[0][1])
    is_delayed = proba >= best_threshold

    return {
        "Prediction": "Delayed" if is_delayed else "On Time",
        "Confidence": round(proba, 3),
        "Threshold": round(float(best_threshold), 3),
    }


# -----------------------------
# Regression prediction function
# -----------------------------
def predict_regression_with_check(DEP_DELAY, DEP_DELAY_NEW, DEP_DEL15, DEP_DELAY_GROUP,
                                  temp, prcp, wspd, wdir,
                                  bad_weather, wind_dir_bucket, time_of_day, is_weekend) -> dict:
    """Predict the delay in minutes with three regressors, if DEP_DEL15 is 1.

    When ``DEP_DEL15`` is 0 the models are skipped and a "No delay predicted"
    status is returned. Otherwise each regressor is evaluated and the largest
    prediction is categorized with ``categorize_delay``.

    Returns:
        Either ``{"Status": ..., "Delay Category": None}`` or a dict with the
        three model predictions, their maximum (all rounded to 2 decimals)
        and the delay category.

    Raises:
        TypeError, ValueError: If a numeric field is empty or not convertible
            by ``int``/``float``.
    """
    if int(DEP_DEL15) == 0:
        return {
            "Status": "No delay predicted",
            "Delay Category": None,
        }

    data = {
        'DEP_DELAY': float(DEP_DELAY),
        'DEP_DELAY_NEW': float(DEP_DELAY_NEW),
        'DEP_DEL15': int(DEP_DEL15),
        'DEP_DELAY_GROUP': int(DEP_DELAY_GROUP),
        'temp': float(temp),
        'prcp': float(prcp),
        'wspd': float(wspd),
        'wdir': float(wdir),
        'bad_weather': int(bad_weather),
        'wind_dir_bucket': _clean_text(wind_dir_bucket),
        'time_of_day': _clean_text(time_of_day),
        'is_weekend': int(is_weekend),
    }

    X = preprocess_regression(pd.DataFrame([data]))

    pred_ridge = _first_prediction(ridge_model, X)
    pred_xgb = _first_prediction(xgb_reg_model, X)
    pred_gbr = _first_prediction(gbr_reg_model, X)

    # The most pessimistic estimate drives the reported category.
    max_pred = max(pred_ridge, pred_xgb, pred_gbr)

    return {
        "Ridge Prediction": round(pred_ridge, 2),
        "XGBoost Prediction": round(pred_xgb, 2),
        "Gradient Boosting Prediction": round(pred_gbr, 2),
        "Max Prediction": round(max_pred, 2),
        "Delay Category": categorize_delay(max_pred),
    }


# -----------------------------
# Gradio Interface
# -----------------------------
# Input order must match the positional parameters of predict_classification.
classification_inputs = [
    gr.Number(label="YEAR"),
    gr.Number(label="MONTH"),
    gr.Number(label="DAY_OF_MONTH"),
    gr.Number(label="DAY_OF_WEEK (1=Mon ... 7=Sun)"),
    gr.Textbox(label="Origin Airport Code"),
    gr.Textbox(label="Destination Airport Code"),
    gr.Textbox(label="Carrier Code"),
    gr.Textbox(label="Origin State Abbreviation"),
    gr.Textbox(label="Destination State Abbreviation"),
    gr.Textbox(label="Departure Time Block (e.g., 0600-0659)"),
    gr.Textbox(label="Arrival Time Block (e.g., 0900-0959)"),
    gr.Number(label="Temperature"),
    gr.Number(label="Precipitation"),
    gr.Number(label="Wind Speed"),
    gr.Number(label="Wind Direction"),
    gr.Number(label="Route Delay Rate (historical)"),
]

# Input order must match the positional parameters of
# predict_regression_with_check.
regression_inputs = [
    gr.Number(label="DEP_DELAY"),
    gr.Number(label="DEP_DELAY_NEW"),
    gr.Number(label="DEP_DEL15 (0 or 1)"),
    gr.Number(label="DEP_DELAY_GROUP"),
    gr.Number(label="Temperature"),
    gr.Number(label="Precipitation"),
    gr.Number(label="Wind Speed"),
    gr.Number(label="Wind Direction"),
    gr.Number(label="Bad Weather (0 or 1)"),
    gr.Textbox(label="Wind Dir Bucket (North/South/East/West/etc.)"),
    gr.Textbox(label="Time of Day (Morning/Afternoon/Evening/Night)"),
    gr.Number(label="Is Weekend (0 or 1)"),
]

classification_tab = gr.Interface(
    fn=predict_classification,
    inputs=classification_inputs,
    outputs="json",
    title="Flight Delay Classification (Tuned Ensemble)",
    description="Predict delay classification using the tuned ensemble model with threshold optimization.",
)

regression_tab = gr.Interface(
    fn=predict_regression_with_check,
    inputs=regression_inputs,
    outputs="json",
    title="Flight Delay Regression (Conditional)",
    description="Predict arrival delay in minutes only if DEP_DEL15=1, with categorized output.",
)

demo = gr.TabbedInterface(
    [classification_tab, regression_tab],
    ["Classification", "Regression"],
)

if __name__ == "__main__":
    demo.launch()