Spaces:
Sleeping
Sleeping
| # This Python file uses the following encoding: utf-8 | |
| # Permet de générer le modèle à partir de la base de données | |
| # Reprise et adaptation du notebook du projet3 | |
| import json | |
| import os | |
| from datetime import datetime | |
| import joblib | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score | |
| from sklearn.model_selection import cross_validate, train_test_split | |
| from sqlalchemy import create_engine | |
# Connection string for the source database; the DATABASE_URL environment
# variable takes precedence over the placeholder default below.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://user:password@localhost/dbname",
)
def get_for_model(features):
    """Split the modelling dataset into train and test partitions.

    Reads the module-level ``building_consumption`` DataFrame and the
    module-level ``var_a_predire`` target name; the target used is the
    ``"log_" + var_a_predire`` column.

    Args:
        features: Column names of ``building_consumption`` to use as inputs.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple produced by an 80/20
        shuffled split with a fixed seed (``random_state=42``).
    """
    X = building_consumption[features]
    y = building_consumption["log_" + var_a_predire]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )

    print(f"Shape de X_train: {X_train.shape}")
    print(f"Shape de y_train: {y_train.shape}")
    print(f"Shape de X_test: {X_test.shape}")
    # This line reports the test target; it was previously mislabelled
    # as "y_train".
    print(f"Shape de y_test: {y_test.shape}")

    return X_train, X_test, y_train, y_test
def get_score(y, prediction):
    """Return ``(mse, mae, r2)`` for ``prediction`` measured against ``y``.

    The first element is the mean squared error (not its square root),
    the second the mean absolute error and the third the R^2 score.
    """
    metrics = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y, prediction) for metric in metrics)
# Accumulates one metrics tuple per trained model, keyed by a model label.
scores = {}

# Load the whole modelling view into memory.
engine = create_engine(DATABASE_URL)
try:
    query = "SELECT * FROM model_view"
    building_consumption = pd.read_sql(query, engine)
finally:
    # Release the engine's connections even when the query fails.
    engine.dispose()

# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in print() would emit an extra "None" line.
building_consumption.info()

# The model is trained on the natural log of the target column.
# NOTE(review): np.log yields -inf/NaN for values <= 0 -- confirm that
# model_view only exposes strictly positive targets.
var_a_predire = "site_energy_use_wn_kbtu"
building_consumption["log_" + var_a_predire] = np.log(
    building_consumption[var_a_predire]
)
# Input columns fed to the model.
features = [
    "year_built",
    "number_of_buildings",
    "number_of_floors",
    "property_gfa_total",
    "property_gfa_parking",
    "second_largest_property_use_type_gfa",
    "third_largest_property_use_type_gfa",
    "multiusage",
    "steam",
    "electricity",
    "natural_gas",
    "neighborhood_id",
    "building_type_id",
    "largest_property_use_type_id",
    "primary_property_type_id",
    "second_largest_property_use_type_id",
    "third_largest_property_use_type_id",
]

X_train, X_test, y_train, y_test = get_for_model(features)

# Model initialisation.
rf = RandomForestRegressor(
    n_estimators=500,
    max_features=0.5,
    random_state=42,
    min_samples_split=5,
    max_depth=20,
)

# --- Evaluation -----------------------------------------------------------
# Fit on the training split only: scoring a model that has already seen the
# test rows would leak them into the reported metrics.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# For a regressor, score() is the R^2 on the given data. It is computed here,
# before the final refit below, so it stays a genuine hold-out figure.
holdout_score = float(rf.score(X_test, y_test))

# 5-fold cross-validation, used here only for its timing information.
scores_cross = cross_validate(rf, X_train, y_train, cv=5)
fit_time = scores_cross["fit_time"].mean()
score_time = scores_cross["score_time"].mean()

# get_score() returns the MSE; take its square root so the value matches the
# "RMSE" label used in the results table and in model_info.json.
mse, mae, r2 = get_score(y_test, y_pred)
rmse = float(np.sqrt(mse))
scores.update(
    {"RandomForestRegressor HP": (rmse, mae, r2, fit_time, score_time)}
)

resultats = pd.DataFrame(scores).T
resultats.columns = ["RMSE", "MAE", "R^2", "Fit Time", "Score Time"]
resultats = resultats.round(2)
# Positional access to the last row: the former [-1:1] slice only selected
# it when the table held exactly one row.
print(resultats.iloc[-1:]["RMSE"].values)
latest = resultats.iloc[-1]

# --- Final model ------------------------------------------------------------
# The saved model is trained on the whole dataset; the metrics stored next to
# it come from the hold-out evaluation above.
X = building_consumption[features]
y = building_consumption["log_" + var_a_predire]
rf.fit(X, y)

# Model metadata.
model_info = {
    "model_type": type(rf).__name__,
    "model_module": type(rf).__module__,
    "has_feature_importances": hasattr(rf, "feature_importances_"),
    "has_coefficients": hasattr(rf, "coef_"),
    "has_predict_proba": hasattr(rf, "predict_proba"),
    "version": "1.0",
    "created_at": datetime.now().isoformat(),
    "features": list(X_train.columns),
    # Key name kept for existing readers of the file; the value is the
    # hold-out R^2, not a classification accuracy.
    "accuracy": holdout_score,
    "RMSE": float(latest["RMSE"]),
    "MAE": float(latest["MAE"]),
    "R^2": float(latest["R^2"]),
}

# Persist the model and its metadata.
joblib.dump(rf, "model.pkl")
with open("model_info.json", "w", encoding="utf-8") as f:
    json.dump(model_info, f, indent=2)