ArthurGamaJorge committed
Commit 936b704 · 1 Parent(s): 48facb6

Add files

Files changed (3)
  1. Dockerfile +18 -0
  2. api/app.py +243 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY ./requirements.txt /app/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+
+ COPY . /app
+ RUN chown -R user:user /app
+
+ USER user
+
+ EXPOSE 8000
+
+ CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
api/app.py ADDED
@@ -0,0 +1,243 @@
+ # Dev: python -m uvicorn app:app --reload  (from inside api/; from the repo root use api.app:app)
+
+ from __future__ import annotations
+ from typing import Any, Dict, List, Optional
+ import os
+ import traceback
+ import joblib
+ import numpy as np
+ import pandas as pd
+ from fastapi import FastAPI, HTTPException, Request
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, Field
+
+ APP_DIR = os.path.dirname(os.path.abspath(__file__))
+ ROOT_DIR = os.path.dirname(APP_DIR)
+ MODEL_PATH = os.path.join(ROOT_DIR, "ai", "models", "stacking_fraude_model_4.pkl")
+ FEATHER_DATASET = os.path.join(ROOT_DIR, "data", "final_dataset.feather")
+ PARQUET_DATASET = os.path.join(ROOT_DIR, "data", "final_dataset.parquet")
+
+ # Columns excluded from the model's input features (identifiers and target/leakage columns).
+ DROP_COLS = {
+     "tx_year",
+     "tx_month",
+     "periodo",
+     "terminal_soft_descriptor",
+     "card_hash",
+     "card_bin",
+     "is_transactional_fraud",
+     "is_fraud",
+     "cluster",
+     "fraude_tipo_extendido",
+ }
+
+
+ class TransactionBody(BaseModel):
+     features: Dict[str, Any] = Field(default_factory=dict)
+
+
+ class BatchBody(BaseModel):
+     items: List[Dict[str, Any]]
+
+
+ _MODEL = None
+ _FEATURES: Optional[List[str]] = None
+ _CARD_MEDIANS: Dict[str, Dict[str, float]] = {}
+ _TERM_MEDIANS: Dict[str, Dict[str, float]] = {}
+
+
+ # Class labels are kept in Portuguese as returned by the API:
+ # 0 = not fraud, 1 = card fraud, 2 = commercial dispute, 3 = terminal fraud, 4 = collusion.
+ FRAUD_TYPE_MAP = {
+     0: ("c0", "não é fraude"),
+     1: ("c1", "fraude em cartão"),
+     2: ("c2", "desacordo comercial"),
+     3: ("c3", "fraude no terminal"),
+     4: ("c4", "conluio"),
+ }
+
+ CARD_FEATURES = {
+     "dias_desde_primeira_transacao_do_cartao",
+     "qtas_transacoes_cartao_dia",
+     "qtas_fraudes_cartao",
+     "valor_medio_cartao",
+     "valor_medio_cartao_3_transacoes",
+     "desvio_padrao_valor_cartao",
+     "entropia_geografica_cartao",
+     "frequencia_transacoes_24h",
+     "media_tempo_entre_transacoes",
+     "fraude_ratio_cartao",
+     "tempo_medio_denuncia_cartao",
+ }
+
+ TERMINAL_FEATURES = {
+     "dias_desde_inicio_terminal",
+     "qtas_transacoes_terminal_dia",
+     "qtas_fraudes_terminal",
+     "valor_medio_terminal",
+     "media_valor_terminal_semana",
+     "fraude_ratio_terminal",
+     "tempo_medio_denuncia_terminal",
+ }
+
+
+ def _predict(ensemble, X: pd.DataFrame) -> Dict[str, Any]:
+     y_pred = ensemble.predict(X)
+     y_prob = ensemble.predict_proba(X) if hasattr(ensemble, "predict_proba") else None
+     items: List[Dict[str, Any]] = []
+     for i in range(len(X)):
+         pred_class = int(y_pred[i])
+         is_fraud = pred_class != 0  # class 0 is the "not fraud" class
+         probs = None
+         if y_prob is not None:
+             pp = y_prob[i]
+             probs = [float(p) for p in pp]
+         code_name = FRAUD_TYPE_MAP.get(pred_class)
+         fraud_code = code_name[0] if code_name else None
+         fraud_label = code_name[1] if code_name else None
+         row = X.iloc[i]
+         debug = {c: (float(row[c]) if pd.notna(row[c]) else None) for c in X.columns}
+         items.append({
+             "predicted_class": pred_class,
+             "is_fraud": is_fraud,
+             "fraud_type": fraud_code if is_fraud else None,
+             "fraud_type_name": fraud_label if is_fraud else None,
+             "class_probabilities": probs,
+             "_debug_processed_features": debug,
+         })
+     return {"items": items}
+
+
+ app = FastAPI(title="Unfraud API", version="1.0.0")
+ # Caution: the CORS spec forbids a wildcard origin on credentialed requests, so "*"
+ # combined with allow_credentials=True is unreliable; prefer listing explicit origins.
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["http://localhost:5173", "http://127.0.0.1:5173", "*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ def _load_model_and_features():
+     global _MODEL, _FEATURES
+     if _MODEL is None:
+         if not os.path.exists(MODEL_PATH):
+             raise FileNotFoundError(f"Model not found: {MODEL_PATH}")
+         _MODEL = joblib.load(MODEL_PATH)
+     if _FEATURES is None:
+         # Prefer the feature names recorded on the fitted model; fall back to the
+         # dataset's columns (minus DROP_COLS) when the model does not expose them.
+         feat_from_model = getattr(_MODEL, "feature_names_in_", None)
+         if feat_from_model is not None:
+             _FEATURES = [c for c in list(feat_from_model) if c not in DROP_COLS]
+         else:
+             if os.path.exists(PARQUET_DATASET):
+                 df_cols = list(pd.read_parquet(PARQUET_DATASET).columns)
+                 _FEATURES = [c for c in df_cols if c not in DROP_COLS]
+             elif os.path.exists(FEATHER_DATASET):
+                 df_cols = list(pd.read_feather(FEATHER_DATASET).columns)
+                 _FEATURES = [c for c in df_cols if c not in DROP_COLS]
+             else:
+                 raise FileNotFoundError("No dataset found to infer the feature list")
+
+
+ def _load_dataset(columns: List[str]) -> pd.DataFrame:
+     if os.path.exists(PARQUET_DATASET):
+         df = pd.read_parquet(PARQUET_DATASET)
+         use = [c for c in columns if c in df.columns] if columns else df.columns
+         return df[use]
+     elif os.path.exists(FEATHER_DATASET):
+         df = pd.read_feather(FEATHER_DATASET)
+         use = [c for c in columns if c in df.columns] if columns else df.columns
+         return df[use]
+     else:
+         raise FileNotFoundError("No dataset file found (.parquet or .feather)")
+
+
+ def _compute_group_medians():
+     # Cache per-card and per-terminal medians from the historical dataset (built once).
+     global _CARD_MEDIANS, _TERM_MEDIANS
+     if _CARD_MEDIANS or _TERM_MEDIANS:
+         return
+     if _FEATURES is None:
+         raise RuntimeError("Features not loaded")
+     df = _load_dataset(list(set(_FEATURES + ["card_hash", "terminal_id"])))
+     num_feats = [c for c in _FEATURES if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]
+     if "card_hash" in df.columns and num_feats:
+         g = df.groupby("card_hash")[num_feats].median(numeric_only=True)
+         _CARD_MEDIANS = {k: {kk: float(vv) for kk, vv in row.dropna().to_dict().items()} for k, row in g.iterrows()}
+     if "terminal_id" in df.columns and num_feats:
+         g2 = df.groupby("terminal_id")[num_feats].median(numeric_only=True)
+         _TERM_MEDIANS = {k: {kk: float(vv) for kk, vv in row.dropna().to_dict().items()} for k, row in g2.iterrows()}
+
+
+ def _enrich_with_id_medians(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     # Backfill missing card/terminal features with historical medians; values the
+     # caller provided are kept as-is.
+     if not items:
+         return items
+     enriched: List[Dict[str, Any]] = []
+     for rec in items:
+         r = dict(rec)
+         ck_raw = rec.get("card_hash")
+         tk_raw = rec.get("terminal_id")
+         ck = str(ck_raw) if ck_raw is not None else None
+         tk = str(tk_raw) if tk_raw is not None else None
+         cm = _CARD_MEDIANS.get(ck) if ck is not None else None
+         tm = _TERM_MEDIANS.get(tk) if tk is not None else None
+         if cm:
+             for k, v in cm.items():
+                 if k in CARD_FEATURES and (k not in r or r[k] in (None, "", "NaN")):
+                     r[k] = v
+         if tm:
+             for k, v in tm.items():
+                 if k in TERMINAL_FEATURES and (k not in r or r[k] in (None, "", "NaN")):
+                     r[k] = v
+         enriched.append(r)
+     return enriched
+
+
+ def _ensure_dataframe(records: List[Dict[str, Any]], feature_order: List[str]) -> pd.DataFrame:
+     df = pd.DataFrame(records)
+     # Coerce everything to numeric (non-parsable values become NaN), align to the
+     # model's expected column order, and fill anything still missing with 0.
+     for col in df.columns:
+         df[col] = pd.to_numeric(df[col], errors="coerce")
+     df = df.reindex(columns=feature_order)
+     df = df.fillna(0)
+     return df
+
+
+ @app.get("/health")
+ def health():
+     return {"status": "ok"}
+
+
+ @app.post("/predict")
+ def predict_one(body: TransactionBody, request: Request):
+     try:
+         _load_model_and_features()
+         _compute_group_medians()
+         assert _FEATURES is not None
+         enriched = _enrich_with_id_medians([body.features])
+         X = _ensure_dataframe(enriched, _FEATURES)
+         output = _predict(_MODEL, X)
+         return output["items"][0]
+     except Exception as e:
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/predict/batch")
+ def predict_batch(body: BatchBody, request: Request):
+     try:
+         if len(body.items) == 0:
+             return {"items": []}
+         _load_model_and_features()
+         _compute_group_medians()
+         assert _FEATURES is not None
+         enriched = _enrich_with_id_medians(body.items)
+         X = _ensure_dataframe(enriched, _FEATURES)
+         output = _predict(_MODEL, X)
+         return output
+     except Exception as e:
+         traceback.print_exc()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ if __name__ == "__main__":
+     # When running this module directly, start uvicorn with the `app` object defined in this file.
+     # reload=True is for development; remove it in production.
+     import uvicorn
+     uvicorn.run("api.app:app", host="0.0.0.0", port=8000, reload=True)
+
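With the service running, the two prediction endpoints take JSON bodies shaped like TransactionBody and BatchBody above. A minimal client sketch, assuming the server is reachable on localhost:8000; the feature names and values below are illustrative only, and requests is a client-side assumption, not part of this commit:

# Hypothetical client for /predict and /predict/batch; field values are made up.
import requests

BASE = "http://localhost:8000"

# Single transaction: card_hash/terminal_id are only used to backfill features
# from historical medians; they are in DROP_COLS, so the model never sees them.
one = requests.post(f"{BASE}/predict", json={
    "features": {
        "card_hash": "abc123",
        "valor_medio_cartao": 150.0,
        "qtas_transacoes_cartao_dia": 3,
    },
})
one.raise_for_status()
print(one.json()["predicted_class"], one.json()["is_fraud"])

# Batch: a list of raw feature dicts.
batch = requests.post(f"{BASE}/predict/batch", json={
    "items": [
        {"card_hash": "abc123", "valor_medio_cartao": 150.0},
        {"terminal_id": "t-42", "valor_medio_terminal": 80.0},
    ],
})
batch.raise_for_status()
for item in batch.json()["items"]:
    print(item["predicted_class"], item["fraud_type_name"])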
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi
+ uvicorn[standard]
+ pandas
+ numpy
+ joblib
+ scikit-learn
+ pyarrow
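One caveat worth noting: these dependencies are unpinned, and the model ships as a joblib pickle. Loading stacking_fraude_model_4.pkl under a different scikit-learn version than the one used for training can fail or silently change behavior, so pinning at least scikit-learn to the training version in requirements.txt is advisable.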