# teste2siep3_ / create_demo_data.py
# brunaaaz's picture
# Create create_demo_data.py
# 419e5f5 verified
# create_demo_data.py - Cria dados de demonstração e modelos base
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
def create_demo_dataset(n_samples=2000, random_state=42):
    """Build a synthetic hotel-booking dataset for demonstration purposes.

    The feature names mirror columns of a hotel-booking dataset; the values
    are drawn from simple parametric distributions, not from real data.

    Args:
        n_samples: Number of rows to generate (default 2000).
        random_state: Seed for the private random generator (default 42).
            Pass ``None`` for a non-reproducible draw.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a ``pandas.DataFrame`` with nine
        numeric feature columns and ``y`` is a ``pandas.Series`` of 0/1
        labels (1 when the simulated cancellation probability exceeds 0.5).
    """
    # A private legacy RandomState yields the same stream that
    # ``np.random.seed(random_state)`` followed by ``np.random.*`` calls
    # would, but without reseeding the process-wide NumPy generator.
    rng = np.random.RandomState(random_state)

    # NOTE: the draw order below determines the generated values; keep it.
    features = {
        'lead_time': rng.gamma(2, 50, n_samples),
        'adr': rng.normal(100, 30, n_samples),
        'adults': rng.poisson(2, n_samples),
        'children': rng.poisson(0.3, n_samples),
        'previous_cancellations': rng.poisson(0.1, n_samples),
        'is_repeated_guest': rng.binomial(1, 0.1, n_samples),
        'required_car_parking_spaces': rng.binomial(1, 0.2, n_samples),
        'total_of_special_requests': rng.poisson(0.5, n_samples),
        'booking_changes': rng.poisson(0.3, n_samples),
    }
    X = pd.DataFrame(features)

    # Linear score plus Gaussian noise, squashed through a logistic function.
    logit = (
        X['lead_time'] * 0.01 +
        X['adr'] * 0.005 -
        X['is_repeated_guest'] * 0.8 -
        X['required_car_parking_spaces'] * 0.3 +
        X['total_of_special_requests'] * -0.4 +
        rng.normal(0, 0.5, n_samples)
    )
    cancellation_prob = 1 / (1 + np.exp(-logit))

    # The label is a hard threshold on the (noisy) probability.
    y = (cancellation_prob > 0.5).astype(int)
    return X, y
def train_and_save_models(output_path='modelos_treinados.pkl'):
    """Train the demonstration classifiers, evaluate them and persist them.

    Generates the demo dataset, makes a stratified 70/30 train/test split,
    standardises the features, fits a logistic regression, a k-nearest
    neighbours classifier and an SVM, scores each on the test split and
    dumps everything into a single joblib file.

    Args:
        output_path: Destination of the joblib file (default
            ``'modelos_treinados.pkl'``).

    Returns:
        The dictionary that was written to disk, with keys ``'models'``,
        ``'X_train'``, ``'X_test'`` (both already scaled), ``'y_train'``,
        ``'y_test'``, ``'results'``, ``'scaler'`` and ``'feature_names'``.
    """
    X, y = create_demo_dataset()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only, so that no test-set
    # statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    estimators = {
        'RL_Padrao': LogisticRegression(random_state=42, max_iter=1000),
        'KNN_Padrao': KNeighborsClassifier(n_neighbors=5),
        # probability=True is required for predict_proba (used for AUC-ROC).
        'SVM_Padrao': SVC(probability=True, random_state=42),
    }

    models = {}
    results = {}
    for name, model in estimators.items():
        # Measure the real fit time instead of reporting a placeholder 0.
        started = time.perf_counter()
        model.fit(X_train_scaled, y_train)
        train_seconds = time.perf_counter() - started
        models[name] = model

        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 1]
        results[name] = {
            'Acurácia': accuracy_score(y_test, y_pred),
            'Precisão': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1-Score': f1_score(y_test, y_pred, zero_division=0),
            'AUC-ROC': roc_auc_score(y_test, y_proba),
            'Tempo Treino (s)': train_seconds,
        }

    data_to_save = {
        'models': models,
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'results': results,
        # The models expect standardised inputs, so the fitted scaler and
        # the column order must travel with them to score raw data later.
        'scaler': scaler,
        'feature_names': list(X.columns),
    }
    joblib.dump(data_to_save, output_path)
    print(f"✅ Dados de demonstração e modelos salvos em '{output_path}'")
    return data_to_save
# Script entry point: train and persist the demo models only when this file
# is executed directly, not when it is imported.
if __name__ == '__main__':
    train_and_save_models()