| import pandas as pd | |
| import sqlite3 | |
| from sqlalchemy import create_engine | |
| from xgboost import XGBClassifier | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import accuracy_score, roc_auc_score | |
| from sklearn.preprocessing import LabelEncoder | |
| import mlflow | |
| import mlflow.sklearn | |
| import joblib | |
| import os | |
# Point MLflow at a SQLite-backed tracking store (file "mlflow.db") before
# any experiment or run is created by the code below.
_MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(_MLFLOW_TRACKING_URI)
def train_model():
    """Train an XGBoost flight-delay classifier and record the run in MLflow.

    Reads every row of the ``cleaned_flights`` table from the SQLite database
    at ``data/flights_database.db``, label-encodes the three categorical
    feature columns, holds out 20% of the rows for evaluation, fits an
    ``XGBClassifier`` and prints accuracy and ROC-AUC on the held-out rows.

    Side effects:
        * Writes ``models/label_encoders.joblib`` (a dict mapping column name
          to its fitted ``LabelEncoder``) and ``models/flight_model.joblib``,
          creating the ``models`` directory if it does not exist.
        * Logs the hyper-parameters, both metrics and the fitted model to the
          ``Flight_Delay_Prediction`` MLflow experiment.

    Returns:
        None. If the table cannot be read, or it contains no rows, a message
        is printed and the function returns without training anything.
    """
    print("Model Training started")

    db_path = 'sqlite:///data/flights_database.db'
    engine = create_engine(db_path)
    try:
        df = pd.read_sql('SELECT * FROM cleaned_flights', engine)
    except Exception as e:
        # Deliberate best-effort boundary: any read failure is reported and
        # training is skipped instead of propagating the error.
        print(f"Error: Database data not found. Check it,: {e}")
        return
    finally:
        # The engine is only needed for this one query; release its pooled
        # connection on both the success and the failure path.
        engine.dispose()

    if df.empty:
        # With zero rows the train/test split below cannot produce a
        # non-empty training set, so stop here with a clear message.
        print("Error: cleaned_flights table is empty. Nothing to train on.")
        return

    categorical_cols = ['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST']
    features = ['MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'CRS_DEP_TIME'] + categorical_cols

    # Work on a copy so the encoding below does not modify `df`.
    X = df[features].copy()
    y = df['is_delayed']

    # Encoders are fitted on the full frame (before the split), so every
    # category present in the table receives an integer code.
    encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        encoders[col] = le

    # Persist the encoders next to the model so the same mapping can be
    # reloaded later.
    os.makedirs('models', exist_ok=True)
    joblib.dump(encoders, 'models/label_encoders.joblib')
    print("All Label Encoders saved to models/label_encoders.joblib")

    # Fixed random_state keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    mlflow.set_experiment("Flight_Delay_Prediction")
    with mlflow.start_run():
        print("XGBoost Model train")
        # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
        # releases - confirm against the pinned version whether it can be
        # dropped. It is kept here so the logged parameters stay unchanged.
        params = {
            "n_estimators": 100,
            "max_depth": 5,
            "learning_rate": 0.1,
            "use_label_encoder": False,
            "eval_metric": "logloss"
        }
        model = XGBClassifier(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Second probability column; presumably the "delayed" class -
        # confirm against the label values stored in `is_delayed`.
        y_proba = model.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)
        print(f"Accuracy: {acc:.2f}")
        print(f"ROC-AUC: {auc:.2f}")

        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("roc_auc", auc)
        mlflow.sklearn.log_model(model, "model")

        # Also keep a local copy of the model outside the MLflow store.
        joblib.dump(model, 'models/flight_model.joblib')
        print("Model saved: models/flight_model.joblib")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_model()