Spaces:
Running
Running
sghorbal committed on
Commit ·
51c1df4
1
Parent(s): 8b78aa0
remove unused files
Browse files- src/model.py +0 -316
- src/service/model.py +0 -241
- tests/test_model.py +0 -23
src/model.py
DELETED
|
@@ -1,316 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import time
|
| 3 |
-
import joblib
|
| 4 |
-
import logging
|
| 5 |
-
import pandas as pd
|
| 6 |
-
from dotenv import load_dotenv
|
| 7 |
-
from typing import Literal, Any, Tuple, Dict, List
|
| 8 |
-
import mlflow
|
| 9 |
-
from mlflow.models import infer_signature
|
| 10 |
-
from mlflow.tracking import MlflowClient
|
| 11 |
-
from sklearn.model_selection import train_test_split
|
| 12 |
-
from sklearn.impute import SimpleImputer
|
| 13 |
-
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
| 14 |
-
from sklearn.compose import ColumnTransformer
|
| 15 |
-
from sklearn.linear_model import LogisticRegression
|
| 16 |
-
from sklearn.pipeline import Pipeline
|
| 17 |
-
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
|
| 18 |
-
|
| 19 |
-
from src.repository.sql import load_matches_from_postgres
|
| 20 |
-
from src.enums import Feature
|
| 21 |
-
|
| 22 |
-
load_dotenv()
|
| 23 |
-
|
| 24 |
-
models = {}
|
| 25 |
-
|
| 26 |
-
def create_pairwise_data(df: pd.DataFrame) -> pd.DataFrame:
    """Build a balanced pairwise-comparison dataset from raw match rows.

    For every match, two records are emitted: one with the winner in the
    first position (target=1) and a mirrored one with the players swapped
    (target=0), so the resulting dataset is perfectly class-balanced.

    Args:
        df (pd.DataFrame): Raw matches with 'series', 'surface', 'court',
            'round', 'w_rank', 'l_rank', 'w_points' and 'l_points' columns.

    Returns:
        pd.DataFrame: One row per comparison, feature columns named after
        the Feature enum, plus a binary 'target' column.
    """
    records = []
    for _, row in df.iterrows():
        # Record 1 : original order (winner in position 1, loser in position 2)
        record_1 = {
            Feature.SERIES.name: row['series'],
            Feature.SURFACE.name: row['surface'],
            Feature.COURT.name: row['court'],
            Feature.ROUND.name: row['round'],
            Feature.DIFF_RANKING.name: row['w_rank'] - row['l_rank'],  # rank difference
            Feature.DIFF_POINTS.name: row['w_points'] - row['l_points'],  # points difference
            'target': 1  # Player in first position won
        }

        # Record 2 : invert players. Key lookups use the Feature enum names
        # (the original hard-coded 'diffRanking'/'diffPoints' literals, which
        # raises KeyError as soon as the enum names diverge from them).
        record_2 = record_1.copy()
        record_2[Feature.DIFF_RANKING.name] = -record_2[Feature.DIFF_RANKING.name]
        record_2[Feature.DIFF_POINTS.name] = -record_2[Feature.DIFF_POINTS.name]
        record_2['target'] = 0  # Player in first position lost

        records.append(record_1)
        records.append(record_2)

    return pd.DataFrame(records)
|
| 53 |
-
|
| 54 |
-
def create_pipeline() -> Pipeline:
    """Assemble the preprocessing + LogisticRegression pipeline.

    Numerical features are mean-imputed then standardised; categorical
    features are one-hot encoded (unknown categories ignored at transform
    time).

    Returns:
        Pipeline: A two-step scikit-learn pipeline ('preprocessor',
        'classifier').
    """
    # Feature lists come from the shared Feature enum
    categorical = [feature.name for feature in Feature.get_features_by_type('category')]
    numerical = [feature.name for feature in Feature.get_features_by_type('number')]

    # Numeric branch: impute missing values with the mean, then scale
    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Combine both branches into a single preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_branch, numerical),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ]
    )

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ])
|
| 89 |
-
|
| 90 |
-
def train_model_from_scratch(
        circuit: Literal['atp', 'wta'],
        from_date: str,
        to_date: str,
        output_path: str = '/data/model.pkl') -> Pipeline:
    """
    Train a model from scratch.

    Loads matches for the given circuit and date range from Postgres,
    trains a fresh pipeline on them and persists it to disk.

    Args:
        circuit: Which tour's table to read ('atp' or 'wta').
        from_date: Inclusive lower bound for match dates (format defined by
            load_matches_from_postgres).
        to_date: Upper bound for match dates.
        output_path: Where to dump the fitted pipeline with joblib.
            NOTE(review): default is the absolute path '/data/model.pkl'
            (the service variant uses './data/model.pkl') — confirm which
            is intended for the deployment environment.

    Returns:
        Pipeline: The fitted pipeline (also written to output_path).
    """
    # Load data
    data = load_matches_from_postgres(
        table_name=f"{circuit}_data",
        from_date=from_date,
        to_date=to_date)

    # Train the model
    pipeline = create_and_train_model(data)

    # Save the model
    joblib.dump(pipeline, output_path)

    return pipeline
|
| 111 |
-
|
| 112 |
-
def create_and_train_model(data: pd.DataFrame) -> Pipeline:
    """Create a fresh pipeline and fit it on the given match data.

    Only the training portion of the split is used here; evaluation is the
    caller's responsibility.

    Returns:
        Pipeline: The fitted pipeline.
    """
    # Keep only the training split; the test split is discarded
    X_train, _, y_train, _ = preprocess_data(data)

    # Build and fit in one go
    return train_model(create_pipeline(), X_train, y_train)
|
| 124 |
-
|
| 125 |
-
def train_model(
        pipeline: Pipeline,
        X_train: pd.DataFrame,
        y_train: pd.DataFrame) -> Pipeline:
    """Fit *pipeline* on the training data and return it.

    Args:
        pipeline (Pipeline): Unfitted scikit-learn pipeline.
        X_train (pd.DataFrame): Training features.
        y_train (pd.DataFrame): Training target.

    Returns:
        Pipeline: The same object, fitted in place.
    """
    pipeline.fit(X_train, y_train)
    return pipeline
|
| 134 |
-
|
| 135 |
-
def preprocess_data(df: pd.DataFrame) -> Tuple:
    """
    Split the dataframe into X (features) and y (target).

    Args:
        df (pd.DataFrame): Input dataframe.

    Returns:
        Tuple: Split data (X_train, X_test, y_train, y_test).
    """
    # Format data for the model
    df_model = create_pairwise_data(df)

    features = [f.name for f in Feature.get_all_features()]
    X = df_model[features]
    y = df_model['target']

    # Split the data
    # NOTE(review): no random_state/stratify here, so the split is
    # non-deterministic; the src/service/model.py variant uses
    # stratify + random_state=42 — confirm whether that is intended.
    return train_test_split(X, y, test_size=0.2)
|
| 154 |
-
|
| 155 |
-
def evaluate_model(pipeline: Pipeline, X_test: pd.DataFrame, y_test: pd.Series) -> Dict:
    """Score a fitted pipeline on held-out data.

    Returns:
        Dict: accuracy, ROC-AUC (from positive-class probabilities) and
        the confusion matrix.
    """
    predictions = pipeline.predict(X_test)
    positive_proba = pipeline.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, predictions),
        "roc_auc": roc_auc_score(y_test, positive_proba),
        "confusion_matrix": confusion_matrix(y_test, predictions),
    }
|
| 169 |
-
|
| 170 |
-
def predict(
        pipeline: Pipeline,
        series: str,
        surface: str,
        court: str,
        round_stage: str,
        rank_player_1: int,
        rank_player_2: int,
        points_player_1: int,
        points_player_2: int
) -> Dict[str, Any]:
    """Predict the outcome of a single match from player 1's perspective.

    Args:
        pipeline: Fitted classification pipeline.
        series, surface, court, round_stage: Categorical match context.
        rank_player_1, rank_player_2: Current rankings of both players.
        points_player_1, points_player_2: Current ranking points.

    Returns:
        Dict[str, Any]: {'result': 0/1 prediction,
        'prob': [lose_probability, win_probability]}.
    """
    # Single-row frame describing the upcoming match; differences are
    # computed inline (player 1 minus player 2)
    new_match = pd.DataFrame([{
        Feature.SERIES.name: series,
        Feature.SURFACE.name: surface,
        Feature.COURT.name: court,
        Feature.ROUND.name: round_stage,
        Feature.DIFF_RANKING.name: rank_player_1 - rank_player_2,
        Feature.DIFF_POINTS.name: points_player_1 - points_player_2
    }])

    prediction = pipeline.predict(new_match)[0]
    proba = pipeline.predict_proba(new_match)[0]

    # Log the outcome for traceability
    logging.info("\n--- 📊 Result ---")
    logging.info(f"🏆 Win probability : {proba[1]:.2f}")
    logging.info(f"❌ Lose probability : {proba[0]:.2f}")
    logging.info(f"🎾 Prediction : {'Victory' if prediction == 1 else 'Loss'}")

    # .item() converts numpy scalars to plain Python values
    return {"result": prediction.item(), "prob": [p.item() for p in proba]}
|
| 205 |
-
|
| 206 |
-
def run_experiment(
        circuit: Literal['atp', 'wta'],
        from_date: str,
        to_date: str,
        artifact_path: str = None,
        registered_model_name: str = 'LogisticRegression',
        experiment_name: str = 'Logistic Tennis Prediction',
        ):
    """
    Run the entire ML experiment pipeline.

    Loads matches from Postgres, trains the logistic-regression pipeline,
    evaluates it on a hold-out split and logs/registers everything with
    MLflow (autolog plus an explicit model log with a signature).

    Args:
        circuit: Which tour's table to read ('atp' or 'wta').
        from_date: Inclusive lower bound for match dates.
        to_date: Upper bound for match dates.
        artifact_path (str): Path to store the model artifact; defaults to
            '<circuit>_model' when not provided.
        registered_model_name (str): Name to register the model under in MLflow.
        experiment_name (str): Name of the MLflow experiment.
    """
    if not artifact_path:
        artifact_path = f'{circuit}_model'

    # Set tracking URI to your mlflow application
    # (raises KeyError if MLFLOW_SERVER_URI is not set — fail fast)
    mlflow.set_tracking_uri(os.environ["MLFLOW_SERVER_URI"])

    # Start timing
    start_time = time.time()

    # Load and preprocess data
    df = load_matches_from_postgres(
        table_name=f"{circuit}_data",
        from_date=from_date,
        to_date=to_date)
    X_train, X_test, y_train, y_test = preprocess_data(df)

    # Create pipeline
    pipe = create_pipeline()

    # Set experiment's info (creates the experiment if it doesn't exist)
    mlflow.set_experiment(experiment_name)

    # Get our experiment info
    experiment = mlflow.get_experiment_by_name(experiment_name)

    # Call mlflow autolog
    mlflow.sklearn.autolog()

    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Train model
        train_model(pipe, X_train, y_train)

        # Store metrics
        # predicted_output = pipe.predict(X_test.values)
        accuracy = pipe.score(X_test, y_test)

        # Print results
        logging.info("LogisticRegression model")
        logging.info("Accuracy: {}".format(accuracy))
        # Signature lets MLflow validate inputs/outputs at serving time
        signature = infer_signature(X_test, pipe.predict(X_test))

        mlflow.sklearn.log_model(
            sk_model=pipe,
            artifact_path=artifact_path,
            registered_model_name=registered_model_name,
            signature=signature
        )

    # Print timing
    logging.info(f"...Training Done! --- Total training time: {time.time() - start_time} seconds")
|
| 273 |
-
|
| 274 |
-
def list_registered_models() -> List[Dict]:
    """
    List all the registered models.

    Returns:
        List[Dict]: One entry per latest model version, with keys
        'name', 'run_id' and 'version'.

    Raises:
        ValueError: If the MLFLOW_SERVER_URI environment variable is unset.
    """
    # Set tracking URI to your Heroku application
    tracking_uri = os.environ.get("MLFLOW_SERVER_URI")
    if tracking_uri is None:
        raise ValueError("MLFLOW_SERVER_URI environment variable is not set.")

    client = MlflowClient(tracking_uri=tracking_uri)
    # Should be:
    # results = client.search_registered_models()
    # but this is not working from inside the container
    # so we need to use the store client to get the registered models
    # HACK: _get_registry_client() is a private MLflow API and may break on
    # upgrade — revisit once the public search works in the container.
    results = client._get_registry_client().store.search_registered_models()

    output = []
    for res in results:
        for mv in res.latest_versions:
            output.append({"name": mv.name, "run_id": mv.run_id, "version": mv.version})

    return output
|
| 296 |
-
|
| 297 |
-
def load_model(name: str, version: str = 'latest') -> Pipeline:
    """
    Load a model from MLflow, with a process-local cache.

    Args:
        name: Registered model name.
        version: Specific model version to load, or 'latest' (default).

    Returns:
        Pipeline: The loaded scikit-learn pipeline.
    """
    # Cache per (name, version); the original keyed on name alone and
    # silently ignored the requested version.
    cache_key = f'{name}:{version}'
    if cache_key in models:
        return models[cache_key]

    mlflow.set_tracking_uri(os.environ["MLFLOW_SERVER_URI"])
    client = MlflowClient()

    if version == 'latest':
        model_info = client.get_registered_model(name)
        source = model_info.latest_versions[0].source
    else:
        # Honour the explicitly requested version (previously the
        # `version` parameter was accepted but never used).
        source = client.get_model_version(name, version).source

    # Load the model
    pipeline = mlflow.sklearn.load_model(model_uri=source)

    logging.info(f'Model {name} loaded')

    models[cache_key] = pipeline

    return pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/service/model.py
DELETED
|
@@ -1,241 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import pandas as pd
|
| 3 |
-
from typing import Optional, Tuple, Dict, Literal, Any
|
| 4 |
-
from sklearn.model_selection import train_test_split
|
| 5 |
-
from sklearn.pipeline import Pipeline
|
| 6 |
-
from sklearn.impute import SimpleImputer
|
| 7 |
-
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
| 8 |
-
from sklearn.compose import ColumnTransformer
|
| 9 |
-
from sklearn.metrics import (
|
| 10 |
-
accuracy_score,
|
| 11 |
-
confusion_matrix,
|
| 12 |
-
f1_score,
|
| 13 |
-
roc_auc_score,
|
| 14 |
-
classification_report
|
| 15 |
-
)
|
| 16 |
-
from src.enums import Feature
|
| 17 |
-
from src.repository.sql import load_matches_from_postgres
|
| 18 |
-
# from src.entity.model import Model
|
| 19 |
-
|
| 20 |
-
logger = logging.getLogger(__name__)
|
| 21 |
-
|
| 22 |
-
def preprocess_data(df: pd.DataFrame, test_size: float = 0.2) -> Tuple:
    """Split the dataframe into features X and target y, then train/test.

    Args:
        df (pd.DataFrame): Input dataframe (already in model format).
        test_size (float): Fraction held out for testing; 0 disables
            the split entirely.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test); when no split is
        requested the test members are empty DataFrames.
    """
    df_model = df

    # DIFF_POINTS and ROUND are deliberately excluded from this model
    selected = [
        f.name
        for f in Feature.get_all_features()
        if f not in [Feature.DIFF_POINTS, Feature.ROUND]
    ]
    X = df_model[selected]
    y = df_model['target']

    if test_size <= 0:
        # No hold-out requested: everything is training data
        return X, pd.DataFrame(), y, pd.DataFrame()

    # Stratified, reproducible split
    return train_test_split(X, y, test_size=test_size, stratify=df_model.target, random_state=42)
|
| 44 |
-
|
| 45 |
-
def train_model(
        pipeline: Pipeline,
        X_train: pd.DataFrame,
        y_train: pd.DataFrame) -> Pipeline:
    """
    Train the pipeline, logging the elapsed wall-clock time.

    Args:
        pipeline (Pipeline): Unfitted scikit-learn pipeline.
        X_train (pd.DataFrame): Training features.
        y_train (pd.DataFrame): Training target.

    Returns:
        Pipeline: The same object, fitted in place.
    """
    # Start the timer (local import kept to avoid touching the module header)
    import time
    start_time = time.time()
    # Use logging (consistent with the rest of this module) instead of
    # bare print(), so output honours the application's log configuration.
    logging.info("Training the model...")
    pipeline.fit(X_train, y_train)
    logging.info("Model trained in %.2f seconds", time.time() - start_time)
    return pipeline
|
| 59 |
-
|
| 60 |
-
# Closed set of classifier names accepted by create_pipeline() /
# create_and_train_model(); used as a Literal type for static checking.
all_algorithms = Literal[
    'XGBoost',
    'RandomForest',
    'SVM',
    'GradientBoosting',
    'MLP',
    'LightGBM',
    'XGBRF',
    'DecisionTree',
    'ExtraTrees',
    'Bagging',
]
|
| 72 |
-
|
| 73 |
-
def create_and_train_model(data: pd.DataFrame,
                           evaluate: bool = False,
                           algo: all_algorithms = 'MLP') -> Pipeline:
    """Create a pipeline for *algo* and fit it on *data*.

    When *evaluate* is True, 20% of the data is held out, the fitted model
    is scored on it, and the metrics are logged.

    Returns:
        Pipeline: The fitted pipeline.
    """
    # Hold out a test split only when an evaluation was requested
    test_size = 0.2 if evaluate else 0.0

    X_train, X_test, y_train, y_test = preprocess_data(df=data, test_size=test_size)

    # Build and fit the requested classifier
    pipeline = train_model(create_pipeline(algo), X_train, y_train)

    if evaluate:
        metrics = evaluate_model(pipeline, X_test, y_test)
        logging.info(f"Evaluation results for {algo}:")
        logging.info(f"F1 Score: {metrics['f1_score']}\n")
        logging.info(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")
        logging.info(f"ROC AUC: {metrics['roc_auc']}\n")
        logging.info(f"Classification Report:\n{metrics['classification_report']}\n")

    return pipeline
|
| 100 |
-
|
| 101 |
-
def evaluate_model(pipeline: Pipeline, X_test: pd.DataFrame, y_test: pd.Series) -> Dict:
    """Compute standard classification metrics for a fitted pipeline.

    Returns:
        Dict: accuracy, F1, confusion matrix, ROC-AUC and the full text
        classification report.
    """
    predictions = pipeline.predict(X_test)
    positive_proba = pipeline.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, predictions),
        "f1_score": f1_score(y_test, predictions),
        "confusion_matrix": confusion_matrix(y_test, predictions),
        "roc_auc": roc_auc_score(y_test, positive_proba),
        "classification_report": classification_report(y_test, predictions),
    }
|
| 119 |
-
|
| 120 |
-
def train_model_from_scratch(limit: Optional[int] = None,
                             evaluate: bool = False,
                             algo: all_algorithms = 'MLP',
                             output_path: str = './data/model.pkl') -> Pipeline:
    """
    Train a model from scratch.

    Args:
        limit: NOTE(review): currently unused — it is never passed to
            load_matches_from_postgres; confirm whether row limiting
            was intended.
        evaluate: When True, hold out a test split and log metrics.
        algo: Classifier name (see all_algorithms).
        output_path: NOTE(review): currently unused — model saving is
            commented out below, so nothing is written to disk.

    Returns:
        Pipeline: The fitted pipeline.
    """
    # Load data
    data = load_matches_from_postgres(table_name='atp_data')

    # Train the model
    pipeline = create_and_train_model(data=data, evaluate=evaluate, algo=algo)

    # Save the model
    # metadata = {
    #     "model_name": algo,
    #     "version": "1.0",
    #     "training_data_size": len(data),
    #     "training_datetime": pd.Timestamp.now().isoformat(),
    # }
    # Model.save_model(pipeline, metadata, output_path)

    return pipeline
|
| 143 |
-
|
| 144 |
-
def create_pipeline(algo: all_algorithms = 'XGBoost') -> Pipeline:
    """
    Creates a machine learning pipeline for the requested algorithm.

    Args:
        algo: Name of the classifier to use (see all_algorithms).

    Returns:
        Pipeline: A scikit-learn pipeline object ('preprocessor',
        'classifier').

    Raises:
        ValueError: If *algo* is not a known algorithm name.
    """
    # Define the features, numerical and categorical
    cat_features = [f.name for f in Feature.get_features_by_type('category')]
    num_features = [f.name for f in Feature.get_features_by_type('number')]

    # Pipeline for numerical variables
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Pipeline for categorical variables
    cat_transformer = OneHotEncoder(handle_unknown='ignore')

    # Preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_features),
            ('cat', cat_transformer, cat_features)
        ]
    )

    # Choose the classifier based on the algorithm; imports are lazy so
    # optional dependencies (xgboost, lightgbm) are only needed when used.
    if algo == 'XGBoost':
        from xgboost import XGBClassifier
        classifier = XGBClassifier(eval_metric='logloss')
    elif algo == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier()
    elif algo == 'SVM':
        from sklearn.svm import SVC
        # probability=True is required: evaluate_model() and predict() in
        # this module call predict_proba, which raises for an SVC fitted
        # with the previous probability=False.
        classifier = SVC(probability=True)
    elif algo == 'GradientBoosting':
        from sklearn.ensemble import GradientBoostingClassifier
        classifier = GradientBoostingClassifier()
    elif algo == 'MLP':
        from sklearn.neural_network import MLPClassifier
        classifier = MLPClassifier(max_iter=1000, verbose=True)
    elif algo == 'LightGBM':
        from lightgbm import LGBMClassifier
        classifier = LGBMClassifier()
    elif algo == 'XGBRF':
        from xgboost import XGBRFClassifier
        classifier = XGBRFClassifier(eval_metric='logloss')
    elif algo == 'DecisionTree':
        from sklearn.tree import DecisionTreeClassifier
        classifier = DecisionTreeClassifier()
    elif algo == 'ExtraTrees':
        from sklearn.ensemble import ExtraTreesClassifier
        classifier = ExtraTreesClassifier()
    elif algo == 'Bagging':
        from sklearn.ensemble import BaggingClassifier
        classifier = BaggingClassifier()
    else:
        raise ValueError(f"Unknown algorithm: {algo}")

    # Full pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    return pipeline
|
| 213 |
-
|
| 214 |
-
def predict(
        pipeline: Pipeline,
        job: str,
        city: str,
        state: str,
        category: str,
        amt: float,
        city_pop: int
) -> Dict[str, Any]:
    """Score one transaction for fraud with a fitted pipeline.

    Args:
        pipeline: Fitted classification pipeline.
        job, city, state, category: Categorical transaction attributes.
        amt: Transaction amount.
        city_pop: Population of the customer's city.

    Returns:
        Dict[str, Any]: {'result': 0/1 prediction,
        'fraud_probability': probability of the fraud class}.
    """
    # Built a DataFrame with the new match
    transaction = pd.DataFrame([{
        Feature.CUSTOMER_CITY.name: city,
        Feature.CUSTOMER_CITY_POP.name: city_pop,
        Feature.CUSTOMER_JOB.name: job,
        Feature.CUSTOMER_STATE.name: state,
        Feature.TRANSACTION_AMOUNT.name: amt,
        Feature.TRANSACTION_CATEGORY.name: category
    }])

    prediction = pipeline.predict(transaction)[0]
    proba = pipeline.predict_proba(transaction)[0]

    # Report the outcome
    logging.info(f"Is fraud: {'True' if prediction == 1 else 'False'}")
    print(f"Probability of fraud: {proba}")

    # .item() converts numpy scalars to plain Python values
    return {"result": prediction.item(), "fraud_probability": proba[1].item()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_model.py
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
from sklearn.pipeline import Pipeline
|
| 3 |
-
|
| 4 |
-
from src.model import create_pairwise_data, create_pipeline
|
| 5 |
-
|
| 6 |
-
def test_create_pairwise_data(simple_match: pd.DataFrame, simple_match_pairwise_data: pd.DataFrame):
    """create_pairwise_data must reproduce the expected fixture exactly."""
    result = create_pairwise_data(simple_match)

    expected = simple_match_pairwise_data
    assert set(result.columns) == set(expected.columns), "Columns are different"
    assert expected.equals(result), "Dataframes are different"
|
| 11 |
-
|
| 12 |
-
def test_create_pairwise_data_empty(simple_match_empty: pd.DataFrame):
    """An empty input frame must yield an empty pairwise frame."""
    assert create_pairwise_data(simple_match_empty).empty, "Dataframe is not empty"
|
| 16 |
-
|
| 17 |
-
def test_create_pipeline():
    """The default pipeline must expose the expected two named steps."""
    pipeline = create_pipeline()

    assert pipeline is not None, "Pipeline is None"
    assert isinstance(pipeline, Pipeline), "Pipeline is not a Pipeline"

    steps = pipeline.named_steps
    assert len(steps) == 2, "Pipeline has wrong number of steps"
    assert 'preprocessor' in steps, "Preprocessor is missing"
    assert 'classifier' in steps, "Classifier is missing"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|