252106862eder committed on
Commit
3730180
·
verified ·
1 Parent(s): 75555fe

Update model_utils.py

Browse files

atualizando model_utils.py

Files changed (1) hide show
  1. model_utils.py +107 -1
model_utils.py CHANGED
@@ -1 +1,107 @@
1
- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model_utils.py
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
6
+ from sklearn.impute import SimpleImputer
7
+ from sklearn.compose import ColumnTransformer
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.linear_model import LogisticRegression
10
+ from imblearn.over_sampling import SMOTE
11
+ from typing import Dict, Any, List, Tuple, Union
12
+
13
+ # Raw input columns the model expects, followed by the name of the target (label) column
14
+ ALL_FEATURES = ['Idade', 'Saldo_Conta', 'Numero_Produtos', 'Tempo_Cliente_Meses',
15
+ 'Genero', 'Cidade', 'Renda_Anual_USD', 'Membro_Ativo', 'Tem_Cartao_Credito']
16
+ TARGET_COLUMN = 'Churn'
17
+
18
class ChurnModelPipeline:
    """Train a churn classifier and score raw customer records.

    The pipeline imputes and scales numeric columns, imputes and one-hot
    encodes object-dtype columns, balances the training classes with SMOTE
    and fits a logistic regression on the result.

    Attributes are ``None`` until :meth:`train` completes successfully.
    """

    def __init__(self):
        self.model = None  # fitted LogisticRegression
        self.preprocessor = None  # fitted ColumnTransformer
        self.feature_names_out = None  # feature names after preprocessing

    @staticmethod
    def _split_feature_types(X: pd.DataFrame) -> Tuple[List[str], List[str], List[str]]:
        """Return ``(numeric, categorical, passthrough)`` column names of *X*.

        Numeric columns are those matching ``np.number``; categorical columns
        are those of ``object`` dtype; every other column (e.g. bool) falls in
        the passthrough group, kept in the order it appears in *X*.
        """
        numeric = X.select_dtypes(include=np.number).columns.tolist()
        categorical = X.select_dtypes(include='object').columns.tolist()
        handled = set(numeric) | set(categorical)
        passthrough = [col for col in X.columns if col not in handled]
        return numeric, categorical, passthrough

    def _build_preprocessor(self, X: pd.DataFrame) -> ColumnTransformer:
        """Build an unfitted ColumnTransformer matching the dtypes of *X*.

        Numeric columns get median imputation followed by standard scaling;
        object columns get most-frequent imputation followed by one-hot
        encoding that ignores categories unseen during fitting. Remaining
        columns are passed through unchanged.
        """
        numeric_features, categorical_features, _ = self._split_feature_types(X)

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])

        return ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
            ],
            remainder='passthrough',
        )

    def train(self, df: pd.DataFrame) -> None:
        """Fit the preprocessor and the classifier on *df*.

        *df* must contain every column in ``ALL_FEATURES`` plus
        ``TARGET_COLUMN``. A stratified 80/20 split is made and only the
        training part is used; the held-out part is discarded.

        The instance attributes are only replaced once every step has
        succeeded, so a failed re-training never leaves a new preprocessor
        paired with a previously fitted model.
        """
        X = df[ALL_FEATURES]
        y = df[TARGET_COLUMN]

        # 1. Stratified train/test split (the test part is not used here).
        X_train, _, y_train, _ = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # 2. Build and fit the preprocessor on the training part only.
        preprocessor = self._build_preprocessor(X_train)
        X_train_processed = preprocessor.fit_transform(X_train)

        # Feature names in the column order ColumnTransformer emits:
        # 'num' output, then 'cat' output, then passthrough columns.
        numeric_f, categorical_f, passthrough_f = self._split_feature_types(X_train)
        feature_names = list(numeric_f)
        if categorical_f:
            # With no categorical columns the 'cat' pipeline is never fitted,
            # so its encoder must not be queried.
            onehot = preprocessor.named_transformers_['cat']['onehot']
            feature_names.extend(onehot.get_feature_names_out(categorical_f))
        feature_names.extend(passthrough_f)

        # 3. Balance the classes with SMOTE.
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

        # 4. Fit the classifier.
        model = LogisticRegression(random_state=42, solver='liblinear', C=0.1, max_iter=500)
        model.fit(X_train_resampled, y_train_resampled)

        # Publish the fitted state atomically.
        self.preprocessor = preprocessor
        self.feature_names_out = feature_names
        self.model = model
        print("Modelo de Churn treinado com sucesso!")

    def predict_churn(self, input_data: pd.DataFrame) -> Tuple[int, float]:
        """Score the first row of *input_data*.

        Returns:
            ``(prediction, probability)`` where *prediction* is the predicted
            label as ``int`` and *probability* is the second column of
            ``predict_proba`` (assumed to be the Churn=1 class — this holds
            when the training labels are 0/1).

        Raises:
            RuntimeError: if :meth:`train` has not been run.
            ValueError: if a column of ``ALL_FEATURES`` is missing or
                *input_data* has no rows.
        """
        if self.model is None or self.preprocessor is None:
            raise RuntimeError("Modelo ou pré-processador não treinados. Chame .train() primeiro.")

        # Validate that every required raw column is present.
        missing_cols = [col for col in ALL_FEATURES if col not in input_data.columns]
        if missing_cols:
            raise ValueError(f"Dados de entrada brutos não contêm todas as features esperadas: {missing_cols}. Features esperadas: {ALL_FEATURES}")

        if len(input_data) == 0:
            raise ValueError("Dados de entrada vazios: é necessária ao menos uma linha para a previsão.")

        # Enforce the training column order and drop any extra columns.
        input_data_ordered = input_data[ALL_FEATURES]

        # Apply the same preprocessing that was fitted during training.
        X_processed = self.preprocessor.transform(input_data_ordered)

        prediction = self.model.predict(X_processed)[0]
        probability_churn = self.model.predict_proba(X_processed)[0][1]

        return int(prediction), float(probability_churn)
99
+
100
+ # Funções auxiliares para Gradio
101
def get_model_coefficients(model_pipeline: "ChurnModelPipeline") -> pd.DataFrame:
    """Return the model coefficients as a table sorted by odds ratio.

    Args:
        model_pipeline: object exposing ``model`` (an estimator with a
            ``coef_`` array) and ``feature_names_out`` (a sequence of names).

    Returns:
        A DataFrame with columns ``Feature``, ``Coeficiente`` and
        ``Odds_Ratio`` (``exp`` of the coefficient), sorted by ``Odds_Ratio``
        in descending order. An empty DataFrame is returned when the model is
        missing, has no ``coef_`` attribute, or no feature names are known.

    Raises:
        ValueError: if the number of feature names differs from the number
            of coefficients.
    """
    model = model_pipeline.model
    names = model_pipeline.feature_names_out

    # Explicit None checks: truthiness is ambiguous for NumPy arrays and
    # unreliable for estimators that define __len__.
    if model is None or names is None or not hasattr(model, 'coef_'):
        return pd.DataFrame()

    names = list(names)
    if not names:
        return pd.DataFrame()

    coefs = np.asarray(model.coef_)
    if coefs.ndim > 1:
        # A 2-D coef_ holds one row per decision function; use the first row.
        coefs = coefs[0]

    if len(names) != len(coefs):
        raise ValueError(
            f"Number of feature names ({len(names)}) does not match "
            f"number of coefficients ({len(coefs)})."
        )

    coef_df = pd.DataFrame({'Feature': names, 'Coeficiente': coefs})
    coef_df['Odds_Ratio'] = np.exp(coef_df['Coeficiente'])
    return coef_df.sort_values(by='Odds_Ratio', ascending=False)