brunaaaz committed on
Commit
8fcbded
·
verified ·
1 Parent(s): 75c8f9c

Delete preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +0 -108
preprocess.py DELETED
@@ -1,108 +0,0 @@
1
- # create_demo_data.py - Cria dados de demonstração e modelos base
2
- import pandas as pd
3
- import numpy as np
4
- from sklearn.model_selection import train_test_split
5
- from sklearn.preprocessing import StandardScaler
6
- from sklearn.linear_model import LogisticRegression
7
- from sklearn.neighbors import KNeighborsClassifier
8
- from sklearn.svm import SVC
9
- from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
10
- import joblib
11
-
12
def create_demo_dataset(n_samples=2000, random_state=42):
    """Build a synthetic hotel-booking dataset with a binary cancellation target.

    Nine numeric features are sampled independently from fixed distributions.
    The target is 1 when a logistic score (a linear combination of some of
    the features plus Gaussian noise) is above 0.5, and 0 otherwise.

    Args:
        n_samples: Number of rows to generate. Defaults to 2000.
        random_state: Seed for the private random generator. Defaults to 42.
            Passing ``None`` makes the output non-reproducible.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a ``pandas.DataFrame`` with
        ``n_samples`` rows and nine feature columns, and ``y`` is a
        ``pandas.Series`` of 0/1 integers aligned with ``X``.
    """
    # A private legacy RandomState yields exactly the same stream that
    # np.random.seed(random_state) followed by np.random.* calls would, but
    # without reseeding the process-wide generator as a side effect.
    rng = np.random.RandomState(random_state)

    # Feature distributions; the draw order matters for reproducibility.
    features = {
        'lead_time': rng.gamma(2, 50, n_samples),
        'adr': rng.normal(100, 30, n_samples),
        'adults': rng.poisson(2, n_samples),
        'children': rng.poisson(0.3, n_samples),
        'previous_cancellations': rng.poisson(0.1, n_samples),
        'is_repeated_guest': rng.binomial(1, 0.1, n_samples),
        'required_car_parking_spaces': rng.binomial(1, 0.2, n_samples),
        'total_of_special_requests': rng.poisson(0.5, n_samples),
        'booking_changes': rng.poisson(0.3, n_samples),
    }

    X = pd.DataFrame(features)

    # Linear score: lead time and price raise it; repeat guests, parking
    # requests and special requests lower it. The noise is drawn last.
    score = (
        X['lead_time'] * 0.01
        + X['adr'] * 0.005
        - X['is_repeated_guest'] * 0.8
        - X['required_car_parking_spaces'] * 0.3
        + X['total_of_special_requests'] * -0.4
        + rng.normal(0, 0.5, n_samples)
    )
    cancellation_prob = 1 / (1 + np.exp(-score))

    # The label is a hard threshold on the probability, not a Bernoulli draw.
    y = (cancellation_prob > 0.5).astype(int)

    return X, y
45
-
46
def train_and_save_models(output_path='modelos_treinados.pkl'):
    """Train the demo classifiers, evaluate them and persist everything.

    The demo dataset is split 70/30 (stratified on the target) and
    standardised with a scaler fitted on the training part only. A logistic
    regression, a 5-nearest-neighbours classifier and an SVC are then fitted
    on the scaled data. Each model is scored on the held-out part, and the
    models, scaled splits, targets, metrics and the fitted scaler are dumped
    to a single joblib file.

    Args:
        output_path: Destination of the joblib dump. Defaults to
            ``'modelos_treinados.pkl'``.

    Returns:
        None. The results are written to ``output_path`` and a confirmation
        line is printed.
    """
    import time  # local import: used only to time the fits below

    # Build the data
    X, y = create_demo_dataset()

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Standardise; statistics come from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Unfitted estimators, keyed by the names used in the saved file
    estimators = {
        'RL_Padrao': LogisticRegression(random_state=42, max_iter=1000),
        'KNN_Padrao': KNeighborsClassifier(n_neighbors=5),
        # probability=True is required for predict_proba during evaluation
        'SVM_Padrao': SVC(probability=True, random_state=42),
    }

    models = {}
    fit_seconds = {}
    for name, estimator in estimators.items():
        # Measure wall-clock fit time so the reported value is real
        started = time.perf_counter()
        estimator.fit(X_train_scaled, y_train)
        fit_seconds[name] = time.perf_counter() - started
        models[name] = estimator

    # Evaluate every model on the held-out split
    results = {}
    for name, model in models.items():
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 1]

        results[name] = {
            'Acurácia': accuracy_score(y_test, y_pred),
            'Precisão': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1-Score': f1_score(y_test, y_pred, zero_division=0),
            'AUC-ROC': roc_auc_score(y_test, y_proba),
            'Tempo Treino (s)': fit_seconds[name],
        }

    # Persist everything; the scaler is included because the models were
    # trained on standardised features and raw inputs must be transformed
    # with the same scaler before prediction.
    data_to_save = {
        'models': models,
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'results': results,
        'scaler': scaler,
    }

    joblib.dump(data_to_save, output_path)
    print(f"✅ Dados de demonstração e modelos salvos em '{output_path}'")
106
-
107
# Script entry point: train and save the demo models only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train_and_save_models()