Spaces:
Sleeping
Sleeping
File size: 3,887 Bytes
# This Python file uses the following encoding: utf-8
# Permet de générer le modèle à partir de la base de données
# Reprise et adaptation du notebook du projet3
import json
import os
from datetime import datetime
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, train_test_split
from sqlalchemy import create_engine
# Database connection string. The DATABASE_URL environment variable takes
# precedence; the literal below is only the fallback used when it is unset.
DATABASE_URL = os.environ.get(
    "DATABASE_URL", "postgresql://user:password@localhost/dbname"
)
def get_for_model(features):
    """Split the module-level dataset into training and test sets.

    Reads two module-level names: the ``building_consumption`` DataFrame and
    ``var_a_predire``. The target is the ``"log_" + var_a_predire`` column of
    that DataFrame. The shape of each returned object is printed.

    Args:
        features: Column labels of ``building_consumption`` to use as
            predictors.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split`` with ``test_size=0.2``, ``random_state=42`` and
        ``shuffle=True``.
    """
    X = building_consumption[features]
    y = building_consumption["log_" + var_a_predire]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
    print(f"Shape de X_train: {X_train.shape}")
    print(f"Shape de y_train: {y_train.shape}")
    print(f"Shape de X_test: {X_test.shape}")
    # The label names the object actually printed (the test target).
    print(f"Shape de y_test: {y_test.shape}")
    return X_train, X_test, y_train, y_test
def get_score(y, prediction):
    """Compute regression metrics for a set of predictions.

    Args:
        y: Observed target values.
        prediction: Predicted values aligned with ``y``.

    Returns:
        A tuple ``(mse, mae, r2)``: mean squared error, mean absolute error
        and coefficient of determination, in that order. The first element is
        the squared error, not its root.
    """
    metrics = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y, prediction) for metric in metrics)
# Training script: load the data, evaluate a random forest on a hold-out
# split, then persist the final model together with its metadata.
scores = {}

# Load the modelling view. The engine is disposed of even when the query
# fails so that its connections are always released.
engine = create_engine(DATABASE_URL)
query = "SELECT * FROM model_view"
try:
    building_consumption = pd.read_sql(query, engine)
finally:
    engine.dispose()
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would only add a stray "None" line.
building_consumption.info()

# The model is trained on the natural logarithm of the target column.
var_a_predire = "site_energy_use_wn_kbtu"
building_consumption["log_" + var_a_predire] = np.log(
    building_consumption[var_a_predire]
)

# Features
features = [
    "year_built",
    "number_of_buildings",
    "number_of_floors",
    "property_gfa_total",
    "property_gfa_parking",
    "second_largest_property_use_type_gfa",
    "third_largest_property_use_type_gfa",
    "multiusage",
    "steam",
    "electricity",
    "natural_gas",
    "neighborhood_id",
    "building_type_id",
    "largest_property_use_type_id",
    "primary_property_type_id",
    "second_largest_property_use_type_id",
    "third_largest_property_use_type_id",
]
X_train, X_test, y_train, y_test = get_for_model(features)

# Model initialisation
rf = RandomForestRegressor(
    n_estimators=500,
    max_features=0.5,
    random_state=42,
    min_samples_split=5,
    max_depth=20,
)

# Evaluation: fit on the training split only, so the test rows stay unseen
# and the hold-out metrics are not inflated by leakage.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Score computed now, while the model has not seen the test rows. It is the
# value stored under the "accuracy" key of the metadata.
holdout_score = float(rf.score(X_test, y_test))

# 5-fold cross-validation on the training split, used here for its timings.
scores_cross = cross_validate(rf, X_train, y_train, cv=5)
fit_time = scores_cross["fit_time"].mean()
score_time = scores_cross["score_time"].mean()

# get_score returns the mean *squared* error; take the root so the value
# matches the "RMSE" column and metadata key it is stored under.
mse, mae, r2 = get_score(y_test, y_pred)
rmse = float(mse) ** 0.5
scores.update(
    {"RandomForestRegressor HP": (rmse, mae, r2, fit_time, score_time)}
)
resultats = pd.DataFrame(scores).T
resultats.columns = ["RMSE", "MAE", "R^2", "Fit Time", "Score Time"]
resultats = resultats.round(2)
# Positional access to the last row; works for any number of rows.
derniere_ligne = resultats.iloc[-1]
print(resultats["RMSE"].values[-1:])

# Final model: refit on the whole dataset before saving. The metrics above
# were measured beforehand, on the model trained without the test rows.
X = building_consumption[features]
y = building_consumption["log_" + var_a_predire]
rf.fit(X, y)

# Model metadata. The metrics are cast to plain floats for JSON output.
model_info = {
    "model_type": type(rf).__name__,
    "model_module": type(rf).__module__,
    "has_feature_importances": hasattr(rf, "feature_importances_"),
    "has_coefficients": hasattr(rf, "coef_"),
    "has_predict_proba": hasattr(rf, "predict_proba"),
    "version": "1.0",
    "created_at": datetime.now().isoformat(),
    "features": list(X_train.columns),
    "accuracy": holdout_score,
    "RMSE": float(derniere_ligne["RMSE"]),
    "MAE": float(derniere_ligne["MAE"]),
    "R^2": float(derniere_ligne["R^2"]),
}

# Save the model and its metadata
joblib.dump(rf, "model.pkl")
with open("model_info.json", "w", encoding="utf-8") as f:
    json.dump(model_info, f, indent=2)
|