sghorbal committed on
Commit
51c1df4
·
1 Parent(s): 8b78aa0

remove unused files

Browse files
Files changed (3) hide show
  1. src/model.py +0 -316
  2. src/service/model.py +0 -241
  3. tests/test_model.py +0 -23
src/model.py DELETED
@@ -1,316 +0,0 @@
1
- import os
2
- import time
3
- import joblib
4
- import logging
5
- import pandas as pd
6
- from dotenv import load_dotenv
7
- from typing import Literal, Any, Tuple, Dict, List
8
- import mlflow
9
- from mlflow.models import infer_signature
10
- from mlflow.tracking import MlflowClient
11
- from sklearn.model_selection import train_test_split
12
- from sklearn.impute import SimpleImputer
13
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
14
- from sklearn.compose import ColumnTransformer
15
- from sklearn.linear_model import LogisticRegression
16
- from sklearn.pipeline import Pipeline
17
- from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
18
-
19
- from src.repository.sql import load_matches_from_postgres
20
- from src.enums import Feature
21
-
22
- load_dotenv()
23
-
24
# In-process cache of loaded MLflow models, keyed by registered model name.
# Populated by load_model() so repeated lookups skip the MLflow round-trip.
models = {}
25
-
26
def create_pairwise_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a balanced pairwise dataset from match results.

    Each input row (winner vs. loser) yields two records: one with the
    winner in the first position (target=1) and a mirrored one with the
    players swapped, i.e. negated difference features (target=0).

    Args:
        df (pd.DataFrame): Matches with 'series', 'surface', 'court',
            'round', 'w_rank', 'l_rank', 'w_points' and 'l_points' columns.

    Returns:
        pd.DataFrame: Two records per input match; empty if df is empty.
    """
    records = []
    for _, row in df.iterrows():
        # Record 1: original order (winner in position 1, loser in position 2)
        record_1 = {
            Feature.SERIES.name: row['series'],
            Feature.SURFACE.name: row['surface'],
            Feature.COURT.name: row['court'],
            Feature.ROUND.name: row['round'],
            Feature.DIFF_RANKING.name: row['w_rank'] - row['l_rank'],  # rank difference
            Feature.DIFF_POINTS.name: row['w_points'] - row['l_points'],  # points difference
            'target': 1  # player in first position won
        }

        # Record 2: players inverted -> negate the difference features.
        # FIX: read the keys through the Feature enum; the original mixed
        # enum-derived keys with hard-coded 'diffRanking'/'diffPoints'
        # literals, which raises KeyError as soon as the enum names diverge.
        record_2 = record_1.copy()
        record_2[Feature.DIFF_RANKING.name] = -record_2[Feature.DIFF_RANKING.name]
        record_2[Feature.DIFF_POINTS.name] = -record_2[Feature.DIFF_POINTS.name]
        record_2['target'] = 0  # player in first position lost

        records.append(record_1)
        records.append(record_2)

    return pd.DataFrame(records)
53
-
54
def create_pipeline() -> Pipeline:
    """
    Build the modelling pipeline: preprocessing + LogisticRegression.

    Numerical features are mean-imputed then standardized; categorical
    features are one-hot encoded (unknown categories ignored).

    Returns:
        Pipeline: An unfitted scikit-learn pipeline.
    """
    # Feature lists come from the project's Feature enum.
    categorical = [feature.name for feature in Feature.get_features_by_type('category')]
    numerical = [feature.name for feature in Feature.get_features_by_type('number')]

    # Numerical branch: impute missing values, then scale.
    numeric_steps = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Combined preprocessor over both feature groups.
    preprocessing = ColumnTransformer(transformers=[
        ('num', numeric_steps, numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ])

    return Pipeline(steps=[
        ('preprocessor', preprocessing),
        ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ])
89
-
90
def train_model_from_scratch(
    circuit: Literal['atp', 'wta'],
    from_date: str,
    to_date: str,
    output_path: str = '/data/model.pkl') -> Pipeline:
    """
    Train a fresh model on matches loaded from Postgres and persist it.

    Args:
        circuit: Tour to train on ('atp' or 'wta'); selects the source table.
        from_date: Start of the date range to load.
        to_date: End of the date range to load.
        output_path: Where to dump the fitted pipeline with joblib.

    Returns:
        Pipeline: The fitted pipeline (also written to output_path).
    """
    matches = load_matches_from_postgres(
        table_name=f"{circuit}_data",
        from_date=from_date,
        to_date=to_date)

    fitted = create_and_train_model(matches)

    # Persist so the serving side can reload without retraining.
    joblib.dump(fitted, output_path)

    return fitted
111
-
112
def create_and_train_model(data: pd.DataFrame) -> Pipeline:
    """
    Build the default pipeline and fit it on the training split of the data.

    Args:
        data (pd.DataFrame): Raw match data.

    Returns:
        Pipeline: The fitted pipeline.
    """
    # Only the training split is used here; the held-out split is discarded.
    X_train, _, y_train, _ = preprocess_data(data)

    return train_model(create_pipeline(), X_train, y_train)
124
-
125
def train_model(
    pipeline: Pipeline,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame) -> Pipeline:
    """
    Fit the given pipeline on the training data and return it.

    Args:
        pipeline (Pipeline): Unfitted scikit-learn pipeline.
        X_train (pd.DataFrame): Training features.
        y_train (pd.DataFrame): Training targets.

    Returns:
        Pipeline: The same pipeline instance, now fitted in place.
    """
    pipeline.fit(X_train, y_train)
    return pipeline
134
-
135
def preprocess_data(df: pd.DataFrame) -> Tuple:
    """
    Turn raw matches into model-ready features and split train/test.

    Args:
        df (pd.DataFrame): Input dataframe of matches.

    Returns:
        Tuple: Split data (X_train, X_test, y_train, y_test), 20% test.
    """
    # Expand each match into two mirrored pairwise records.
    pairwise = create_pairwise_data(df)

    feature_names = [feature.name for feature in Feature.get_all_features()]
    features = pairwise[feature_names]
    target = pairwise['target']

    # NOTE(review): the split is unseeded, so successive calls shuffle
    # differently — confirm that is intended.
    return train_test_split(features, target, test_size=0.2)
154
-
155
def evaluate_model(pipeline: Pipeline, X_test: pd.DataFrame, y_test: pd.Series) -> Dict:
    """
    Score a fitted pipeline on held-out data.

    Args:
        pipeline (Pipeline): Fitted pipeline exposing predict/predict_proba.
        X_test (pd.DataFrame): Held-out features.
        y_test (pd.Series): Held-out targets.

    Returns:
        Dict: accuracy, roc_auc and the confusion matrix.
    """
    predictions = pipeline.predict(X_test)
    # ROC AUC needs the probability of the positive class.
    positive_proba = pipeline.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, predictions),
        "roc_auc": roc_auc_score(y_test, positive_proba),
        "confusion_matrix": confusion_matrix(y_test, predictions),
    }
169
-
170
def predict(
    pipeline: Pipeline,
    series: str,
    surface: str,
    court: str,
    round_stage: str,
    rank_player_1: int,
    rank_player_2: int,
    points_player_1: int,
    points_player_2: int
) -> Dict[str, Any]:
    """
    Predict the outcome of a single match for the player in position 1.

    Returns:
        Dict[str, Any]: 'result' (1 = player 1 wins) and 'prob'
        ([lose, win] probabilities).
    """
    # Difference features, always from player 1's point of view.
    ranking_gap = rank_player_1 - rank_player_2
    points_gap = points_player_1 - points_player_2

    # Single-row frame describing the upcoming match.
    new_match = pd.DataFrame([{
        Feature.SERIES.name: series,
        Feature.SURFACE.name: surface,
        Feature.COURT.name: court,
        Feature.ROUND.name: round_stage,
        Feature.DIFF_RANKING.name: ranking_gap,
        Feature.DIFF_POINTS.name: points_gap
    }])

    prediction = pipeline.predict(new_match)[0]
    proba = pipeline.predict_proba(new_match)[0]

    logging.info("\n--- 📊 Result ---")
    logging.info(f"🏆 Win probability : {proba[1]:.2f}")
    logging.info(f"❌ Lose probability : {proba[0]:.2f}")
    logging.info(f"🎾 Prediction : {'Victory' if prediction == 1 else 'Loss'}")

    # .item() converts numpy scalars to plain Python types for JSON safety.
    return {"result": prediction.item(), "prob": [p.item() for p in proba]}
205
-
206
def run_experiment(
    circuit: Literal['atp', 'wta'],
    from_date: str,
    to_date: str,
    artifact_path: str = None,
    registered_model_name: str = 'LogisticRegression',
    experiment_name: str = 'Logistic Tennis Prediction',
    ):
    """
    Run the full MLflow experiment: load data, train, evaluate, log.

    Args:
        circuit: Tour whose matches to train on ('atp' or 'wta').
        from_date: Start of the date range to load.
        to_date: End of the date range to load.
        artifact_path (str): MLflow artifact path; defaults to '<circuit>_model'.
        registered_model_name (str): Name to register the model under in MLflow.
        experiment_name (str): Name of the MLflow experiment.
    """
    if not artifact_path:
        artifact_path = f'{circuit}_model'

    # Point MLflow at the tracking server (KeyError if the env var is unset).
    mlflow.set_tracking_uri(os.environ["MLFLOW_SERVER_URI"])

    start_time = time.time()

    # Load matches and build the train/test splits.
    matches = load_matches_from_postgres(
        table_name=f"{circuit}_data",
        from_date=from_date,
        to_date=to_date)
    X_train, X_test, y_train, y_test = preprocess_data(matches)

    pipe = create_pipeline()

    # Resolve the experiment (creating it if needed) and grab its id.
    mlflow.set_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

    # Let MLflow capture params/metrics automatically during fit.
    mlflow.sklearn.autolog()

    with mlflow.start_run(experiment_id=experiment.experiment_id):
        train_model(pipe, X_train, y_train)

        # Held-out accuracy for the log below.
        accuracy = pipe.score(X_test, y_test)

        logging.info("LogisticRegression model")
        logging.info("Accuracy: {}".format(accuracy))
        signature = infer_signature(X_test, pipe.predict(X_test))

        mlflow.sklearn.log_model(
            sk_model=pipe,
            artifact_path=artifact_path,
            registered_model_name=registered_model_name,
            signature=signature
        )

    logging.info(f"...Training Done! --- Total training time: {time.time() - start_time} seconds")
273
-
274
def list_registered_models() -> List[Dict]:
    """
    List the latest versions of every registered MLflow model.

    Returns:
        List[Dict]: One dict per latest version with 'name', 'run_id'
        and 'version' keys.

    Raises:
        ValueError: If the MLFLOW_SERVER_URI environment variable is unset.
    """
    tracking_uri = os.environ.get("MLFLOW_SERVER_URI")
    if tracking_uri is None:
        raise ValueError("MLFLOW_SERVER_URI environment variable is not set.")

    client = MlflowClient(tracking_uri=tracking_uri)
    # HACK: client.search_registered_models() does not work from inside
    # the container, so query the registry store directly (private API).
    results = client._get_registry_client().store.search_registered_models()

    return [
        {"name": mv.name, "run_id": mv.run_id, "version": mv.version}
        for res in results
        for mv in res.latest_versions
    ]
296
-
297
def load_model(name: str, version: str = 'latest') -> Pipeline:
    """
    Load a registered model from MLflow, with in-process caching.

    Args:
        name (str): Registered model name in MLflow.
        version (str): Requested version. NOTE(review): currently unused —
            the first entry of latest_versions is always loaded; confirm
            whether specific versions should be supported.

    Returns:
        Pipeline: The loaded (and now cached) pipeline.
    """
    # Serve from the module-level cache when already loaded.
    if name in models:  # idiomatic membership test (was `in models.keys()`)
        return models[name]

    mlflow.set_tracking_uri(os.environ["MLFLOW_SERVER_URI"])
    client = MlflowClient()

    model_info = client.get_registered_model(name)

    # Load the artifact behind the most recent registered version.
    pipeline = mlflow.sklearn.load_model(model_uri=model_info.latest_versions[0].source)

    logging.info(f'Model {name} loaded')

    models[name] = pipeline

    return pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/service/model.py DELETED
@@ -1,241 +0,0 @@
1
- import logging
2
- import pandas as pd
3
- from typing import Optional, Tuple, Dict, Literal, Any
4
- from sklearn.model_selection import train_test_split
5
- from sklearn.pipeline import Pipeline
6
- from sklearn.impute import SimpleImputer
7
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
8
- from sklearn.compose import ColumnTransformer
9
- from sklearn.metrics import (
10
- accuracy_score,
11
- confusion_matrix,
12
- f1_score,
13
- roc_auc_score,
14
- classification_report
15
- )
16
- from src.enums import Feature
17
- from src.repository.sql import load_matches_from_postgres
18
- # from src.entity.model import Model
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
def preprocess_data(df: pd.DataFrame, test_size: float = 0.2) -> Tuple:
    """
    Split the dataframe into X (features) and y (target), then train/test.

    Args:
        df (pd.DataFrame): Input dataframe; must contain a 'target' column.
        test_size (float): Fraction held out for testing; 0 disables the split.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test). When test_size is 0,
        the test parts are empty dataframes.
    """
    # DIFF_POINTS and ROUND are deliberately excluded from the feature set.
    selected = [
        f.name for f in Feature.get_all_features()
        if f not in [Feature.DIFF_POINTS, Feature.ROUND]
    ]
    X = df[selected]
    y = df['target']

    if test_size > 0:
        # Stratify on the target to keep class balance in both splits;
        # fixed seed makes the split reproducible.
        return train_test_split(X, y, test_size=test_size, stratify=df.target, random_state=42)
    return X, pd.DataFrame(), y, pd.DataFrame()
44
-
45
def train_model(
    pipeline: Pipeline,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame) -> Pipeline:
    """
    Fit the pipeline on the training data, logging the wall-clock time.

    Args:
        pipeline (Pipeline): Unfitted scikit-learn pipeline.
        X_train (pd.DataFrame): Training features.
        y_train (pd.DataFrame): Training targets.

    Returns:
        Pipeline: The same pipeline instance, now fitted in place.
    """
    import time  # local import kept to avoid touching the module header
    start_time = time.time()
    # FIX: use the module logger instead of print, consistent with the
    # logging used everywhere else in this module.
    logger.info("Training the model...")
    pipeline.fit(X_train, y_train)
    logger.info(f"Model trained in {time.time() - start_time:.2f} seconds")
    return pipeline
59
-
60
# Closed set of classifier identifiers accepted by create_pipeline,
# create_and_train_model and train_model_from_scratch.
all_algorithms = Literal[
    'XGBoost',
    'RandomForest',
    'SVM',
    'GradientBoosting',
    'MLP',
    'LightGBM',
    'XGBRF',
    'DecisionTree',
    'ExtraTrees',
    'Bagging',
]
72
-
73
def create_and_train_model(data: pd.DataFrame,
                           evaluate: bool = False,
                           algo: all_algorithms = 'MLP') -> Pipeline:
    """
    Create a pipeline for the chosen algorithm and fit it on the data.

    Args:
        data (pd.DataFrame): Raw training data.
        evaluate (bool): When True, hold out 20% and log evaluation metrics.
        algo: Which classifier to use (see all_algorithms).

    Returns:
        Pipeline: The fitted pipeline.
    """
    # Only hold out a test split when an evaluation was requested.
    test_size = 0.2 if evaluate else 0.0

    X_train, X_test, y_train, y_test = preprocess_data(df=data, test_size=test_size)

    pipeline = train_model(create_pipeline(algo), X_train, y_train)

    if evaluate:
        metrics = evaluate_model(pipeline, X_test, y_test)
        logging.info(f"Evaluation results for {algo}:")
        logging.info(f"F1 Score: {metrics['f1_score']}\n")
        logging.info(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")
        logging.info(f"ROC AUC: {metrics['roc_auc']}\n")
        logging.info(f"Classification Report:\n{metrics['classification_report']}\n")

    return pipeline
100
-
101
def evaluate_model(pipeline: Pipeline, X_test: pd.DataFrame, y_test: pd.Series) -> Dict:
    """
    Score a fitted pipeline on held-out data.

    Args:
        pipeline (Pipeline): Fitted pipeline exposing predict/predict_proba.
        X_test (pd.DataFrame): Held-out features.
        y_test (pd.Series): Held-out targets.

    Returns:
        Dict: accuracy, f1_score, confusion_matrix, roc_auc and the
        full text classification_report.
    """
    predictions = pipeline.predict(X_test)
    # ROC AUC needs the probability of the positive class.
    positive_proba = pipeline.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, predictions),
        "f1_score": f1_score(y_test, predictions),
        "confusion_matrix": confusion_matrix(y_test, predictions),
        "roc_auc": roc_auc_score(y_test, positive_proba),
        "classification_report": classification_report(y_test, predictions),
    }
119
-
120
def train_model_from_scratch(limit: Optional[int] = None,
                             evaluate: bool = False,
                             algo: all_algorithms = 'MLP',
                             output_path: str = './data/model.pkl') -> Pipeline:
    """
    Train a model from scratch on the full atp_data table.

    Args:
        limit (Optional[int]): NOTE(review): currently unused — confirm
            whether it should cap the number of rows loaded.
        evaluate (bool): Forwarded to create_and_train_model.
        algo: Which classifier to train.
        output_path (str): Intended save location (persistence is
            currently disabled).

    Returns:
        Pipeline: The fitted pipeline.
    """
    data = load_matches_from_postgres(table_name='atp_data')

    # Model persistence is currently disabled; the fitted pipeline is
    # only returned to the caller.
    return create_and_train_model(data=data, evaluate=evaluate, algo=algo)
143
-
144
def create_pipeline(algo: all_algorithms = 'XGBoost') -> Pipeline:
    """
    Build a preprocessing + classifier pipeline for the chosen algorithm.

    Args:
        algo: One of the identifiers in all_algorithms.

    Returns:
        Pipeline: An unfitted scikit-learn pipeline.

    Raises:
        ValueError: If algo is not a known algorithm.
    """
    # Feature lists come from the project's Feature enum.
    categorical = [f.name for f in Feature.get_features_by_type('category')]
    numerical = [f.name for f in Feature.get_features_by_type('number')]

    # Numerical branch: impute missing values, then scale.
    numeric_steps = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Combined preprocessor over both feature groups.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_steps, numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ])

    # Classifier backends are imported lazily so only the selected
    # library needs to be installed.
    if algo == 'XGBoost':
        from xgboost import XGBClassifier
        classifier = XGBClassifier(eval_metric='logloss')
    elif algo == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier()
    elif algo == 'SVM':
        from sklearn.svm import SVC
        # NOTE(review): probability=False means predict_proba is
        # unavailable, which evaluate_model and predict rely on — confirm.
        classifier = SVC(probability=False)
    elif algo == 'GradientBoosting':
        from sklearn.ensemble import GradientBoostingClassifier
        classifier = GradientBoostingClassifier()
    elif algo == 'MLP':
        from sklearn.neural_network import MLPClassifier
        classifier = MLPClassifier(max_iter=1000, verbose=True)
    elif algo == 'LightGBM':
        from lightgbm import LGBMClassifier
        classifier = LGBMClassifier()
    elif algo == 'XGBRF':
        from xgboost import XGBRFClassifier
        classifier = XGBRFClassifier(eval_metric='logloss')
    elif algo == 'DecisionTree':
        from sklearn.tree import DecisionTreeClassifier
        classifier = DecisionTreeClassifier()
    elif algo == 'ExtraTrees':
        from sklearn.ensemble import ExtraTreesClassifier
        classifier = ExtraTreesClassifier()
    elif algo == 'Bagging':
        from sklearn.ensemble import BaggingClassifier
        classifier = BaggingClassifier()
    else:
        raise ValueError(f"Unknown algorithm: {algo}")

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
213
-
214
def predict(
    pipeline: Pipeline,
    job: str,
    city: str,
    state: str,
    category: str,
    amt: float,
    city_pop: int
) -> Dict[str, Any]:
    """
    Predict whether a single transaction is fraudulent.

    Returns:
        Dict[str, Any]: 'result' (1 = fraud) and 'fraud_probability'
        (probability of the positive class).
    """
    # One-row frame describing the transaction to score.
    transaction = pd.DataFrame([{
        Feature.CUSTOMER_CITY.name: city,
        Feature.CUSTOMER_CITY_POP.name: city_pop,
        Feature.CUSTOMER_JOB.name: job,
        Feature.CUSTOMER_STATE.name: state,
        Feature.TRANSACTION_AMOUNT.name: amt,
        Feature.TRANSACTION_CATEGORY.name: category
    }])

    prediction = pipeline.predict(transaction)[0]
    proba = pipeline.predict_proba(transaction)[0]

    logging.info(f"Is fraud: {'True' if prediction == 1 else 'False'}")
    print(f"Probability of fraud: {proba}")
    # .item() converts numpy scalars to plain Python types for JSON safety.
    return {"result": prediction.item(), "fraud_probability": proba[1].item()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_model.py DELETED
@@ -1,23 +0,0 @@
1
- import pandas as pd
2
- from sklearn.pipeline import Pipeline
3
-
4
- from src.model import create_pairwise_data, create_pipeline
5
-
6
def test_create_pairwise_data(simple_match: pd.DataFrame, simple_match_pairwise_data: pd.DataFrame):
    """Pairwise expansion of a single match matches the expected fixture."""
    result = create_pairwise_data(simple_match)

    assert set(result.columns) == set(simple_match_pairwise_data.columns), "Columns are different"
    assert simple_match_pairwise_data.equals(result), "Dataframes are different"
11
-
12
def test_create_pairwise_data_empty(simple_match_empty: pd.DataFrame):
    """An empty input dataframe yields an empty pairwise dataframe."""
    result = create_pairwise_data(simple_match_empty)

    assert result.empty, "Dataframe is not empty"
16
-
17
def test_create_pipeline():
    """The default pipeline has exactly a preprocessor and a classifier step."""
    pipeline = create_pipeline()
    assert pipeline is not None, "Pipeline is None"
    assert isinstance(pipeline, Pipeline), "Pipeline is not a Pipeline"
    assert len(pipeline.named_steps) == 2, "Pipeline has wrong number of steps"
    assert 'preprocessor' in pipeline.named_steps, "Preprocessor is missing"
    assert 'classifier' in pipeline.named_steps, "Classifier is missing"