# This Python file uses the following encoding: utf-8
# Builds the prediction model from the data stored in the database.
# Adapted from the project 3 notebook.
import json
import os
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, train_test_split
from sqlalchemy import create_engine

DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://user:password@localhost/dbname")


def get_for_model(features):
    """Split the loaded dataset into train and test sets.

    Reads the module-level ``building_consumption`` DataFrame and the
    ``var_a_predire`` target name, so both must be defined before the call.
    The target is the ``"log_" + var_a_predire`` column.

    Args:
        features: Column names of ``building_consumption`` used as predictors.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by a
        shuffled 80/20 split with ``random_state=42``.
    """
    X = building_consumption[features]
    y = building_consumption["log_" + var_a_predire]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    print(f"Shape de X_train: {X_train.shape}")
    print(f"Shape de y_train: {y_train.shape}")
    print(f"Shape de X_test: {X_test.shape}")
    # The original label said "y_train" while printing y_test's shape.
    print(f"Shape de y_test: {y_test.shape}")
    return X_train, X_test, y_train, y_test


def get_score(y, prediction):
    """Return ``(mse, mae, r2)`` for ``prediction`` against the true ``y``.

    Note that the first element is the mean *squared* error, not its root.
    """
    mse = mean_squared_error(y, prediction)
    mae = mean_absolute_error(y, prediction)
    r2 = r2_score(y, prediction)
    return mse, mae, r2


scores = {}

# Load the modelling view; always release the connection pool, even when the
# query fails.
engine = create_engine(DATABASE_URL)
query = "SELECT * FROM model_view"
try:
    building_consumption = pd.read_sql(query, engine)
finally:
    engine.dispose()
# DataFrame.info() writes to stdout itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
building_consumption.info()

var_a_predire = "site_energy_use_wn_kbtu"
target_values = building_consumption[var_a_predire]
# log() of a non-positive value yields -inf/NaN, which would only surface
# later as an opaque error inside the estimator; fail early and clearly.
if (target_values <= 0).any():
    raise ValueError(
        f"Column {var_a_predire!r} contains non-positive values; "
        "cannot apply the log transform."
    )
building_consumption["log_" + var_a_predire] = np.log(target_values)

# Features
features = [
    "year_built",
    "number_of_buildings",
    "number_of_floors",
    "property_gfa_total",
    "property_gfa_parking",
    "second_largest_property_use_type_gfa",
    "third_largest_property_use_type_gfa",
    "multiusage",
    "steam",
    "electricity",
    "natural_gas",
    "neighborhood_id",
    "building_type_id",
    "largest_property_use_type_id",
    "primary_property_type_id",
    "second_largest_property_use_type_id",
    "third_largest_property_use_type_id",
]

X_train, X_test, y_train, y_test = get_for_model(features)

# Model initialisation
rf = RandomForestRegressor(
    n_estimators=500,
    max_features=0.5,
    random_state=42,
    min_samples_split=5,
    max_depth=20,
)

# Evaluation: fit on the training split only, so the hold-out metrics are not
# contaminated by rows the model has already seen.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
holdout_r2 = rf.score(X_test, y_test)

# 5-fold cross-validation on the training split (cross_validate works on
# clones, so ``rf`` itself is left untouched); used for timing information.
scores_cross = cross_validate(rf, X_train, y_train, cv=5)
fit_time = scores_cross["fit_time"].mean()
score_time = scores_cross["score_time"].mean()

model_name = "RandomForestRegressor HP"
mse, mae, r2 = get_score(y_test, y_pred)
# The results table labels its first column "RMSE", so store the root of the
# mean squared error rather than the MSE itself.
scores.update({model_name: (float(np.sqrt(mse)), mae, r2, fit_time, score_time)})

resultats = pd.DataFrame(scores).T
resultats.columns = ["RMSE", "MAE", "R^2", "Fit Time", "Score Time"]
resultats = resultats.round(2)
# Select the row by its label instead of the fragile ``[-1:1]`` slice, which
# only matched a row when the table had exactly one entry.
final_scores = resultats.loc[model_name]
print(resultats.loc[[model_name], "RMSE"].values)

# Final model: refit on the whole dataset for the persisted artifact, now that
# the hold-out metrics above have been computed without leakage.
X = building_consumption[features]
y = building_consumption["log_" + var_a_predire]
rf.fit(X, y)

# Model metadata (metrics are expressed on the log-transformed target).
model_info = {
    "model_type": type(rf).__name__,
    "model_module": type(rf).__module__,
    "has_feature_importances": hasattr(rf, "feature_importances_"),
    "has_coefficients": hasattr(rf, "coef_"),
    "has_predict_proba": hasattr(rf, "predict_proba"),
    "version": "1.0",
    "created_at": datetime.now().isoformat(),
    "features": list(X_train.columns),
    # Regressor.score() is R^2; the key name is kept for existing consumers.
    "accuracy": float(holdout_r2),
    "RMSE": float(final_scores["RMSE"]),
    "MAE": float(final_scores["MAE"]),
    "R^2": float(final_scores["R^2"]),
}

# Persist the model and its metadata
joblib.dump(rf, "model.pkl")
with open("model_info.json", "w", encoding="utf-8") as f:
    json.dump(model_info, f, indent=2)