flightt / train.py
saad1BM's picture
Upload 10 files
cc12750 verified
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import mlflow
import mlflow.sklearn
import joblib
import os
# Module-level side effect: as soon as this file is imported or run, MLflow is
# configured to record runs in the SQLite tracking store named by this URI.
# NOTE(review): the URI carries no absolute path, so mlflow.db presumably
# resolves relative to the current working directory — confirm when deploying.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
def train_model(
    *,
    db_url: str = "sqlite:///data/flights_database.db",
    model_dir: str = "models",
) -> None:
    """Train an XGBoost flight-delay classifier and persist it with its encoders.

    Reads the ``cleaned_flights`` table, label-encodes the categorical
    columns, fits an ``XGBClassifier`` on an 80/20 train/test split, logs the
    parameters, metrics and model to the ``Flight_Delay_Prediction`` MLflow
    experiment, and writes the encoders and the model under ``model_dir``.

    Args:
        db_url: SQLAlchemy URL of the database holding ``cleaned_flights``.
        model_dir: Directory that receives ``label_encoders.joblib`` and
            ``flight_model.joblib``; created if it does not exist.

    Returns:
        None. If the table cannot be read, an error message is printed and
        the function returns without training anything.
    """
    print("Model Training started")

    engine = create_engine(db_url)
    try:
        df = pd.read_sql('SELECT * FROM cleaned_flights', engine)
    except Exception as e:
        # Deliberate best-effort: report the problem and stop instead of
        # propagating a traceback to the caller.
        print(f"Error: Database data not found. Check it,: {e}")
        return
    finally:
        # The frame is fully loaded (or the read failed); release the
        # engine's pooled connections before the long-running training step.
        engine.dispose()

    categorical = ['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST']
    features = ['MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'CRS_DEP_TIME'] + categorical

    # Work on a copy so the encoded columns do not write back into ``df``.
    X = df[features].copy()
    y = df['is_delayed']

    # One encoder per categorical column, kept by column name so the same
    # mappings can be reloaded later from the saved file.
    # NOTE: the encoders are fitted before the split, so categories that end
    # up only in the test rows still receive a code.
    encoders = {}
    for col in categorical:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        encoders[col] = le

    os.makedirs(model_dir, exist_ok=True)
    encoders_path = f"{model_dir}/label_encoders.joblib"
    joblib.dump(encoders, encoders_path)
    print(f"All Label Encoders saved to {encoders_path}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    mlflow.set_experiment("Flight_Delay_Prediction")
    with mlflow.start_run():
        print("XGBoost Model train")
        # NOTE(review): ``use_label_encoder`` is reported as deprecated by
        # recent XGBoost releases and may only trigger a warning there —
        # confirm against the pinned version before removing it.
        params = {
            "n_estimators": 100,
            "max_depth": 5,
            "learning_rate": 0.1,
            "use_label_encoder": False,
            "eval_metric": "logloss",
        }
        model = XGBClassifier(**params)
        model.fit(X_train, y_train)

        # Hard predictions feed accuracy; the positive-class probability
        # (column 1) feeds ROC-AUC.
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]
        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)
        print(f"Accuracy: {acc:.2f}")
        print(f"ROC-AUC: {auc:.2f}")

        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("roc_auc", auc)
        mlflow.sklearn.log_model(model, "model")

        # Also keep a plain joblib copy next to the encoders, independent of
        # the MLflow artifact store.
        model_path = f"{model_dir}/flight_model.joblib"
        joblib.dump(model, model_path)
        print(f"Model saved: {model_path}")
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_model()