mboukabous commited on
Commit
6f1cd08
·
1 Parent(s): 829e3ac

Change file structure

Browse files
app.py CHANGED
@@ -48,14 +48,9 @@ import sys
48
  import glob
49
  import re
50
 
51
- # Add the project root directory to the Python path
52
- current_dir = os.path.dirname(os.path.abspath(__file__))
53
- project_root = os.path.abspath(os.path.join(current_dir, '../../'))
54
- sys.path.append(project_root)
55
-
56
  def get_model_modules():
57
  # Get the list of available model modules
58
- models_dir = os.path.join(project_root, 'models', 'supervised', 'regression')
59
  model_files = glob.glob(os.path.join(models_dir, '*.py'))
60
 
61
  # Debugging: Print the models directory and found files
@@ -107,7 +102,7 @@ def train_model(model_module, data_option, data_file, data_path, data_name_kaggl
107
  return "Invalid data option selected.", None
108
 
109
  # Prepare command-line arguments
110
- cmd = [sys.executable, os.path.join(project_root, 'scripts', 'train_regression_model.py')]
111
  cmd.extend(['--model_module', model_module])
112
  cmd.extend(['--data_path', data_path])
113
  cmd.extend(['--target_variable', target_variable])
@@ -293,4 +288,4 @@ with gr.Blocks() as demo:
293
  )
294
 
295
  if __name__ == "__main__":
296
- demo.launch(share=True)
 
48
  import glob
49
  import re
50
 
 
 
 
 
 
51
  def get_model_modules():
52
  # Get the list of available model modules
53
+ models_dir = os.path.join('models', 'supervised', 'regression')
54
  model_files = glob.glob(os.path.join(models_dir, '*.py'))
55
 
56
  # Debugging: Print the models directory and found files
 
102
  return "Invalid data option selected.", None
103
 
104
  # Prepare command-line arguments
105
+ cmd = [sys.executable, os.path.join('scripts', 'train_regression_model.py')]
106
  cmd.extend(['--model_module', model_module])
107
  cmd.extend(['--data_path', data_path])
108
  cmd.extend(['--target_variable', target_variable])
 
288
  )
289
 
290
  if __name__ == "__main__":
291
+ demo.launch()
data/preprocessing/README.md DELETED
@@ -1 +0,0 @@
1
- # preprocessing
 
 
data/utils/README.md DELETED
@@ -1 +0,0 @@
1
- # utils
 
 
models/computer_vision/README.md DELETED
@@ -1 +0,0 @@
1
- # computer_vision
 
 
models/deep_learning/README.md DELETED
@@ -1 +0,0 @@
1
- # deep_learning
 
 
models/nlp/README.md DELETED
@@ -1 +0,0 @@
1
- # nlp
 
 
models/reinforcement_learning/README.md DELETED
@@ -1 +0,0 @@
1
- # reinforcement_learning
 
 
models/supervised/classification/README.md DELETED
@@ -1 +0,0 @@
1
- # classification
 
 
models/unsupervised/README.md DELETED
@@ -1 +0,0 @@
1
- # unsupervised
 
 
requirements.txt CHANGED
@@ -8,5 +8,4 @@ catboost==1.2.7
8
  dask[dataframe]==2024.10.0
9
  xgboost==2.1.2
10
  lightgbm==4.5.0
11
- joblib==1.4.2
12
- gradio==5.7.1
 
8
  dask[dataframe]==2024.10.0
9
  xgboost==2.1.2
10
  lightgbm==4.5.0
11
+ joblib==1.4.2
 
scripts/train_classification_model.py DELETED
@@ -1,185 +0,0 @@
1
- """
2
- This script trains classification models using scikit-learn.
3
- It includes data loading, preprocessing, encoding of target variable,
4
- hyperparameter tuning, model evaluation, and saving of models, metrics,
5
- and visualizations.
6
-
7
- Usage:
8
- python train_classification_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
9
- --target_variable TARGET_VARIABLE
10
-
11
- Optional arguments:
12
- --test_size TEST_SIZE
13
- --random_state RANDOM_STATE
14
- --cv_folds CV_FOLDS
15
- --scoring_metric SCORING_METRIC
16
- --model_path MODEL_PATH
17
- --results_path RESULTS_PATH
18
- --visualize
19
- --drop_columns COLUMN_NAMES
20
-
21
- Example:
22
- python train_classification_model.py --model_module logistic_regression
23
- --data_path data/titanic/train.csv
24
- --target_variable Survived --drop_columns PassengerId
25
- --visualize
26
- """
27
-
28
- import os
29
- import sys
30
- import argparse
31
- import importlib
32
- import pandas as pd
33
- import numpy as np
34
- import matplotlib.pyplot as plt
35
- from sklearn.model_selection import train_test_split
36
- from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
37
- confusion_matrix, ConfusionMatrixDisplay)
38
- import joblib
39
-
40
- def main(args):
41
- # Change to the root directory of the project
42
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
43
- os.chdir(project_root)
44
- sys.path.insert(0, project_root)
45
-
46
- # Import the hyperparameter tuning and the model modules
47
- from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning
48
- model_module_path = f"models.supervised.classification.{args.model_module}"
49
- model_module = importlib.import_module(model_module_path)
50
-
51
- # Get the model estimator, parameters grid, and the scoring metric
52
- estimator = model_module.estimator
53
- param_grid = model_module.param_grid
54
- scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'accuracy')
55
- model_name = estimator.__class__.__name__
56
-
57
- # Set default paths if not provided
58
- args.model_path = args.model_path or os.path.join('saved_models', model_name)
59
- args.results_path = args.results_path or os.path.join('results', model_name)
60
- os.makedirs(args.results_path, exist_ok=True)
61
-
62
- # Load the dataset
63
- df = pd.read_csv(os.path.join(args.data_path))
64
-
65
- # Drop specified columns
66
- if args.drop_columns:
67
- columns_to_drop = args.drop_columns.split(',')
68
- df = df.drop(columns=columns_to_drop)
69
-
70
- # Define target variable and features
71
- target_variable = args.target_variable
72
- X = df.drop(columns=[target_variable])
73
- y = df[target_variable]
74
-
75
- # Ensure target variable is categorical
76
- if np.issubdtype(y.dtype, np.number) and len(np.unique(y)) > 20:
77
- raise ValueError(f"The target variable '{target_variable}' seems to be continuous. Please ensure it's categorical for classification tasks.")
78
-
79
- # Encode target variable if not numeric
80
- if y.dtype == 'object' or not np.issubdtype(y.dtype, np.number):
81
- from sklearn.preprocessing import LabelEncoder
82
- le = LabelEncoder()
83
- y = le.fit_transform(y)
84
- # Save label encoder for inverse transformation
85
- joblib.dump(le, os.path.join(args.model_path, 'label_encoder.pkl'))
86
-
87
- # Split the data
88
- X_train, X_test, y_train, y_test = train_test_split(
89
- X, y, test_size=args.test_size, random_state=args.random_state, stratify=y)
90
-
91
- # Perform hyperparameter tuning
92
- best_model, best_params = classification_hyperparameter_tuning(
93
- X_train, y_train, estimator, param_grid,
94
- cv=args.cv_folds, scoring=scoring_metric)
95
-
96
- # Evaluate the best model on the test set
97
- y_pred = best_model.predict(X_test)
98
- y_test_actual = y_test
99
-
100
- # Save the trained model
101
- model_output_path = os.path.join(args.model_path, 'best_model.pkl')
102
- os.makedirs(args.model_path, exist_ok=True)
103
- joblib.dump(best_model, model_output_path)
104
- print(f"Trained model saved to {model_output_path}")
105
-
106
- # Calculate metrics
107
- accuracy = accuracy_score(y_test_actual, y_pred)
108
- precision = precision_score(y_test_actual, y_pred, average='weighted', zero_division=0)
109
- recall = recall_score(y_test_actual, y_pred, average='weighted', zero_division=0)
110
- f1 = f1_score(y_test_actual, y_pred, average='weighted', zero_division=0)
111
- print(f"\n{model_name} Classification Metrics on Test Set:")
112
- print(f"- Accuracy: {accuracy:.4f}")
113
- print(f"- Precision: {precision:.4f}")
114
- print(f"- Recall: {recall:.4f}")
115
- print(f"- F1 Score: {f1:.4f}")
116
- # Save metrics
117
- metrics = {'Accuracy': [accuracy], 'Precision': [precision], 'Recall': [recall], 'F1 Score': [f1]}
118
-
119
- # Save metrics to CSV
120
- results_df = pd.DataFrame(metrics)
121
- results_df.to_csv(os.path.join(args.results_path, 'metrics.csv'), index=False)
122
- print(f"\nMetrics saved to {os.path.join(args.results_path, 'metrics.csv')}")
123
-
124
- if args.visualize:
125
- # Plot Classification Metrics
126
- plt.figure(figsize=(8, 6))
127
- # Extract metrics and values
128
- metric_names = list(metrics.keys())
129
- metric_values = [value[0] for value in metrics.values()] # Extract the single value from each list
130
-
131
- # Create bar chart
132
- plt.bar(metric_names, metric_values, color='skyblue', alpha=0.8)
133
- plt.ylim(0, 1) # Metrics like accuracy, precision, etc., are between 0 and 1
134
- plt.xlabel('Metrics')
135
- plt.ylabel('Scores')
136
- plt.title('Classification Metrics')
137
-
138
- # Save and display the plot
139
- plt.savefig(os.path.join(args.results_path, 'classification_metrics.png'))
140
- plt.show()
141
- print(f"Visualization saved to {os.path.join(args.results_path, 'classification_metrics.png')}")
142
-
143
- # Display and save the confusion matrix
144
- conf_matrix = confusion_matrix(y_test_actual, y_pred)
145
- disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
146
- disp.plot(cmap=plt.cm.Blues, values_format='d') # Format as integers for counts
147
- plt.title(f'{model_name} Confusion Matrix')
148
-
149
- # Save the confusion matrix plot
150
- conf_matrix_path = os.path.join(args.results_path, 'confusion_matrix.png')
151
- plt.savefig(conf_matrix_path)
152
- plt.show()
153
- print(f"Confusion matrix saved to {conf_matrix_path}")
154
-
155
- if __name__ == "__main__":
156
- parser = argparse.ArgumentParser(description="Train a classification model.")
157
- # Model module argument
158
- parser.add_argument('--model_module', type=str, required=True,
159
- help='Name of the classification model module to import.')
160
- # Data arguments
161
- parser.add_argument('--data_path', type=str, required=True,
162
- help='Path to the dataset file including data name.')
163
- parser.add_argument('--target_variable', type=str, required=True,
164
- help='Name of the target variable.')
165
- parser.add_argument('--drop_columns', type=str, default='',
166
- help='Columns to drop from the dataset.')
167
- # Model arguments
168
- parser.add_argument('--test_size', type=float, default=0.2,
169
- help='Proportion for test split.')
170
- parser.add_argument('--random_state', type=int, default=42,
171
- help='Random seed.')
172
- parser.add_argument('--cv_folds', type=int, default=5,
173
- help='Number of cross-validation folds.')
174
- parser.add_argument('--scoring_metric', type=str, default=None,
175
- help='Scoring metric for model evaluation.')
176
- # Output arguments
177
- parser.add_argument('--model_path', type=str, default=None,
178
- help='Path to save the trained model.')
179
- parser.add_argument('--results_path', type=str, default=None,
180
- help='Path to save results and metrics.')
181
- parser.add_argument('--visualize', action='store_true',
182
- help='Generate and save visualizations.')
183
-
184
- args = parser.parse_args()
185
- main(args)