252106862eder committed on
Commit
6be7ae6
·
verified ·
1 Parent(s): e716a96

Update model_utils.py

Browse files

atualizada a base com modelo da tarefa

Files changed (1) hide show
  1. model_utils.py +34 -17
model_utils.py CHANGED
@@ -1,4 +1,5 @@
1
- # model_utils.py
 
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.model_selection import train_test_split
@@ -10,10 +11,12 @@ from sklearn.linear_model import LogisticRegression
10
  from imblearn.over_sampling import SMOTE
11
  from typing import Dict, Any, List, Tuple, Union
12
 
13
- # Definir as colunas que seu modelo espera
14
- ALL_FEATURES = ['Idade', 'Saldo_Conta', 'Numero_Produtos', 'Tempo_Cliente_Meses',
15
- 'Genero', 'Cidade', 'Renda_Anual_USD', 'Membro_Ativo', 'Tem_Cartao_Credito']
16
- TARGET_COLUMN = 'Churn'
 
 
17
 
18
  class ChurnModelPipeline:
19
  def __init__(self):
@@ -23,8 +26,9 @@ class ChurnModelPipeline:
23
 
24
  def _build_preprocessor(self, X: pd.DataFrame) -> ColumnTransformer:
25
  # Identificar features numéricas e categóricas com base no dataframe X
26
- numeric_features = X.select_dtypes(include=np.number).columns.tolist()
27
- categorical_features = X.select_dtypes(include='object').columns.tolist()
 
28
 
29
  numeric_transformer = Pipeline(steps=[
30
  ('imputer', SimpleImputer(strategy='median')),
@@ -41,36 +45,49 @@ class ChurnModelPipeline:
41
  ('num', numeric_transformer, numeric_features),
42
  ('cat', categorical_transformer, categorical_features)
43
  ],
44
- remainder='passthrough'
45
  )
46
  return preprocessor
47
 
48
  def train(self, df: pd.DataFrame) -> None:
 
 
 
 
 
 
 
 
49
  X = df[ALL_FEATURES]
50
  y = df[TARGET_COLUMN]
51
 
52
- # 1. Dividir em treino e teste (estratificado para Churn)
 
53
  X_train, _, y_train, _ = train_test_split(
54
  X, y, test_size=0.2, random_state=42, stratify=y
55
  )
 
 
56
 
57
  # 2. Construir e ajustar o pré-processador
58
  self.preprocessor = self._build_preprocessor(X_train)
59
  X_train_processed = self.preprocessor.fit_transform(X_train)
60
 
61
- # Obter nomes das features após o pré-processamento
62
- numeric_f = X_train.select_dtypes(include=np.number).columns.tolist()
63
- categorical_f = X_train.select_dtypes(include='object').columns.tolist()
64
 
 
65
  self.feature_names_out = numeric_f + list(self.preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_f))
 
66
 
67
 
68
  # 3. Balanceamento de Classes com SMOTE
69
  smote = SMOTE(random_state=42)
70
  X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
 
71
 
72
- # 4. Treinar o modelo
73
- # Usamos LogisticRegression, similar ao seu relatório
74
  self.model = LogisticRegression(random_state=42, solver='liblinear', C=0.1, max_iter=500)
75
  self.model.fit(X_train_resampled, y_train_resampled)
76
  print("Modelo de Churn treinado com sucesso!")
@@ -84,7 +101,7 @@ class ChurnModelPipeline:
84
  missing_cols = [col for col in ALL_FEATURES if col not in input_data.columns]
85
  raise ValueError(f"Dados de entrada brutos não contêm todas as features esperadas: {missing_cols}. Features esperadas: {ALL_FEATURES}")
86
 
87
- # Garantir a ordem das colunas e que todas as features estejam presentes, mesmo que preenchidas com NaN
88
  input_data_ordered = input_data[ALL_FEATURES]
89
 
90
  # Aplicar o mesmo pré-processamento usado no treino
@@ -92,12 +109,12 @@ class ChurnModelPipeline:
92
 
93
  # Fazer a previsão
94
  prediction = self.model.predict(X_processed)[0]
95
- # Obter a probabilidade da classe positiva (Churn=1)
96
  probability_churn = self.model.predict_proba(X_processed)[0][1]
97
 
98
  return int(prediction), float(probability_churn)
99
 
100
- # Funções auxiliares para Gradio
101
  def get_model_coefficients(model_pipeline: ChurnModelPipeline) -> pd.DataFrame:
102
  if model_pipeline.model and hasattr(model_pipeline.model, 'coef_') and model_pipeline.feature_names_out:
103
  coefs = model_pipeline.model.coef_[0] if model_pipeline.model.coef_.ndim > 1 else model_pipeline.model.coef_
 
1
+ # model_utils.py - ATUALIZADO
2
+
3
  import pandas as pd
4
  import numpy as np
5
  from sklearn.model_selection import train_test_split
 
11
  from imblearn.over_sampling import SMOTE
12
  from typing import Dict, Any, List, Tuple, Union
13
 
14
+ # --- DEFINIÇÃO DAS FEATURES E COLUNA ALVO PARA SEU data.csv ---
15
+ ALL_FEATURES = [
16
+ 'creditscore', 'geography', 'gender', 'age', 'tenure',
17
+ 'balance', 'numofproducts', 'hascrcard', 'isactivemember', 'estimatedsalary'
18
+ ]
19
+ TARGET_COLUMN = 'exited' # Sua coluna alvo agora é 'exited'
20
 
21
  class ChurnModelPipeline:
22
  def __init__(self):
 
26
 
27
  def _build_preprocessor(self, X: pd.DataFrame) -> ColumnTransformer:
28
  # Identificar features numéricas e categóricas com base no dataframe X
29
+ # A seleção é feita a partir das ALL_FEATURES definidas
30
+ numeric_features = X[ALL_FEATURES].select_dtypes(include=np.number).columns.tolist()
31
+ categorical_features = X[ALL_FEATURES].select_dtypes(include='object').columns.tolist()
32
 
33
  numeric_transformer = Pipeline(steps=[
34
  ('imputer', SimpleImputer(strategy='median')),
 
45
  ('num', numeric_transformer, numeric_features),
46
  ('cat', categorical_transformer, categorical_features)
47
  ],
48
+ remainder='passthrough' # Manter colunas não usadas se houver (e.g., customerid, surname)
49
  )
50
  return preprocessor
51
 
52
  def train(self, df: pd.DataFrame) -> None:
53
+ print(f"Iniciando treinamento com {len(df)} linhas e features: {ALL_FEATURES}")
54
+ print(f"Coluna alvo: {TARGET_COLUMN}")
55
+
56
+ # Validar se todas as ALL_FEATURES e TARGET_COLUMN existem no DataFrame
57
+ missing_cols = [col for col in ALL_FEATURES + [TARGET_COLUMN] if col not in df.columns]
58
+ if missing_cols:
59
+ raise ValueError(f"Colunas ausentes no DataFrame: {missing_cols}. Verifique seu 'data.csv'.")
60
+
61
  X = df[ALL_FEATURES]
62
  y = df[TARGET_COLUMN]
63
 
64
+ # 1. Dividir em treino e teste (estratificado para a coluna 'exited')
65
+ # Para datasets pequenos como o seu exemplo, test_size=0.2 é um bom balanceamento
66
  X_train, _, y_train, _ = train_test_split(
67
  X, y, test_size=0.2, random_state=42, stratify=y
68
  )
69
+ print(f"X_train shape antes do pre-processamento: {X_train.shape}")
70
+ print(f"y_train value counts antes do SMOTE: {y_train.value_counts()}")
71
 
72
  # 2. Construir e ajustar o pré-processador
73
  self.preprocessor = self._build_preprocessor(X_train)
74
  X_train_processed = self.preprocessor.fit_transform(X_train)
75
 
76
+ # Obter nomes das features após one-hot encoding
77
+ numeric_f = X_train[ALL_FEATURES].select_dtypes(include=np.number).columns.tolist()
78
+ categorical_f = X_train[ALL_FEATURES].select_dtypes(include='object').columns.tolist()
79
 
80
+ # get_feature_names_out é mais robusto para ColumnTransformer
81
  self.feature_names_out = numeric_f + list(self.preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_f))
82
+ print(f"X_train_processed shape após pre-processamento: {X_train_processed.shape}")
83
 
84
 
85
  # 3. Balanceamento de Classes com SMOTE
86
  smote = SMOTE(random_state=42)
87
  X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
88
+ print(f"y_train_resampled value counts após SMOTE: {y_train_resampled.value_counts()}")
89
 
90
+ # 4. Treinar o modelo (Logistic Regression)
 
91
  self.model = LogisticRegression(random_state=42, solver='liblinear', C=0.1, max_iter=500)
92
  self.model.fit(X_train_resampled, y_train_resampled)
93
  print("Modelo de Churn treinado com sucesso!")
 
101
  missing_cols = [col for col in ALL_FEATURES if col not in input_data.columns]
102
  raise ValueError(f"Dados de entrada brutos não contêm todas as features esperadas: {missing_cols}. Features esperadas: {ALL_FEATURES}")
103
 
104
+ # Garantir a ordem das colunas e que todas as features estejam presentes
105
  input_data_ordered = input_data[ALL_FEATURES]
106
 
107
  # Aplicar o mesmo pré-processamento usado no treino
 
109
 
110
  # Fazer a previsão
111
  prediction = self.model.predict(X_processed)[0]
112
+ # Obter a probabilidade da classe positiva (exited=1)
113
  probability_churn = self.model.predict_proba(X_processed)[0][1]
114
 
115
  return int(prediction), float(probability_churn)
116
 
117
+ # Funções auxiliares para Gradio (ainda não usadas na UI, mas úteis)
118
  def get_model_coefficients(model_pipeline: ChurnModelPipeline) -> pd.DataFrame:
119
  if model_pipeline.model and hasattr(model_pipeline.model, 'coef_') and model_pipeline.feature_names_out:
120
  coefs = model_pipeline.model.coef_[0] if model_pipeline.model.coef_.ndim > 1 else model_pipeline.model.coef_