File size: 9,621 Bytes
eb5ec73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer
from pydantic import BaseModel
from typing import List, Optional
import requests
import asyncio

from src.features.build_features import DataImporter, TextPreprocessor, ImagePreprocessor
from src.models.train_model_API import TextRnnModel, ImageVGG16Model, concatenate
from tensorflow import keras
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
import tensorflow as tf
import sys
import json
import time
import numpy as np
import pandas as pd
from src.tools import f1_m, load_model
import sys
import random
import datetime

# Instantiate the FastAPI application that exposes the POST /train endpoint.
app = FastAPI()
# Dependency that supplies the bearer token to the endpoint; "token" is the
# URL advertised to clients for obtaining one.
# NOTE(review): OAuth2PasswordBearer defaults to auto_error=True, so a request
# without an Authorization header is presumably rejected with 401 before the
# handler runs, even when the payload sets api_secured=False — confirm that
# this is intended.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")

class TrainInput(BaseModel):
    # Request payload of the POST /train endpoint. Every field has a default,
    # so an empty JSON object is a valid request.

    # CSV paths handed to DataImporter (features and labels respectively).
    x_train_path: Optional[str] = "data/preprocessed/X_train_update.csv"
    y_train_path: Optional[str] = "data/preprocessed/Y_train_CVw08PX.csv"
    # Directory handed to ImagePreprocessor.
    images_path: Optional[str] = "data/preprocessed/image_train"
    # Directory passed to DataImporter and to both model classes; the endpoint
    # also reads tokenizer_config.json from it and writes best_weights.json
    # and performances.json into it.
    model_path: Optional[str] = "models"
    # Number of epochs requested for both the RNN and the VGG16 training.
    n_epochs: Optional[int] = 1
    # Forwarded to DataImporter.split_train_test when full_train is True.
    samples_per_class: Optional[int] = 50  # Caution: If samples_per_class==0 , the Full Dataset will be used
    # When truthy, the test split is preprocessed and scored after training.
    with_test: Optional[bool] = False
    # Seed for the split and the sampling; a negative value makes the endpoint
    # draw a random seed in [0, 100].
    random_state: Optional[int] = 42
    # True: train on the split built with samples_per_class.
    # False: fine-tuning mode — the split uses 10 samples per class and the
    # last n_sales_ft rows of the dataset are appended to the train set.
    full_train: Optional[bool] = True
    # Number of trailing dataset rows added to the train set in fine-tuning mode.
    n_sales_ft: Optional[int] = 50
    # When True, the bearer token is checked against the authentication
    # service before training starts.
    api_secured: Optional[bool] = False

# Endpoint of the authentication service queried when ``api_secured`` is set.
_AUTH_URL = "http://api_oauth:8001/secured"
# Upper bound (seconds) on the authentication call, so that an unreachable
# service cannot hang the training request indefinitely.
_AUTH_TIMEOUT_SECONDS = 10


def _fetch_authenticated_user(token):
    """Return the user record the authentication service gives for ``token``.

    Raises:
        HTTPException: 503 when the service cannot be reached in time, or the
            service's own status code when it does not answer 200.
    """
    try:
        auth_response = requests.get(
            _AUTH_URL,
            headers={"Authorization": f"Bearer {token}"},
            timeout=_AUTH_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Service d'authentification indisponible",
        ) from exc
    if auth_response.status_code != 200:
        raise HTTPException(status_code=auth_response.status_code, detail="Non autorisé à accéder à l'entrainment du modèle")
    return auth_response.json()


def _last_rows(df, n_rows):
    """Return the last ``n_rows`` rows of ``df``; an empty frame if ``n_rows`` <= 0."""
    # ``df[-0:]`` is ``df[0:]`` and would return the WHOLE frame, and a
    # negative count would drop leading rows, so both are handled explicitly.
    if not n_rows or n_rows <= 0:
        return df[:0]
    return df[-n_rows:]


@app.post("/train")
async def main(input_data: TrainInput, token: Optional[str] = Depends(oauth2_scheme)):
    """Train the text, image and combined models and record their scores.

    Steps: optional authorization check, data loading and splitting, text and
    image preprocessing, RNN training, VGG16 training, optimisation of the
    weights combining both models, optional scoring on the test split, and
    writing ``best_weights.json`` and ``performances.json`` under
    ``input_data.model_path``.

    Args:
        input_data: Training options (paths, epochs, sampling, mode flags).
        token: Bearer token, only used when ``input_data.api_secured`` is True.

    Returns:
        A dict with a single ``"message"`` key: a success message, or a
        refusal message when the user's authorization level is below 2.

    Raises:
        HTTPException: When the authentication service is unreachable or
            rejects the token.
    """
    # NOTE: the authentication call and the training below are blocking and
    # run inside this coroutine, so the event loop is busy until they finish.

    # When api_secured is True, check the caller's credentials first.
    if input_data.api_secured:
        user_data = _fetch_authenticated_user(token)
        user_info = user_data['FirstName']+" "+user_data['LastName']
        # Insufficient level is reported in the body (HTTP 200), as before.
        if user_data['Authorization'] < 2:
            return {"message": f"{user_info} n'est pas autorisé à effectuer l'entrainment du modèle"}

    # bool() maps an explicit null to False instead of True.
    with_test = bool(input_data.with_test)
    samples_per_class = input_data.samples_per_class
    n_epochs = input_data.n_epochs
    full_train = input_data.full_train
    n_sales_ft = input_data.n_sales_ft
    # A missing or negative seed means "draw one at random".
    random_state = input_data.random_state
    if random_state is None or random_state < 0:
        random_state = random.randint(0, 100)

    t_debut = time.time()
    data_importer = DataImporter(input_data.x_train_path, input_data.y_train_path, input_data.model_path)
    df = data_importer.load_data()

    if full_train:
        X_train, X_val, X_test, y_train, y_val, y_test = \
            data_importer.split_train_test(df, samples_per_class=samples_per_class, random_state=random_state, with_test=with_test)
    else:
        # Fine-tuning: a small balanced split plus the most recent rows.
        X_train, X_val, X_test, y_train, y_val, y_test = \
            data_importer.split_train_test(df, samples_per_class=10, random_state=random_state, with_test=with_test)
        df2 = _last_rows(df, n_sales_ft)
        y_train2 = df2["prdtypecode"]
        X_train2 = df2.drop(["prdtypecode"], axis=1)
        y_train = pd.concat([y_train, y_train2], axis=0)
        X_train = pd.concat([X_train, X_train2], axis=0)
        X_train = X_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        print('============================')
        print("Final Finetuning Dataset size : ", len(X_train)+len(X_val)+len(X_test))
        print("Final Finetuning Train size   : ", len(X_train))
        print("Final Finetuning Val size     : ", len(X_val))
        print("Final Finetuning Test size    : ", len(X_test))
        print('============================')
        # Recorded as 0 in the recap and used below to select the sample
        # count passed to the combined model.
        samples_per_class = 0

    # Preprocess text and images
    text_preprocessor = TextPreprocessor()
    image_preprocessor = ImagePreprocessor(input_data.images_path)
    text_preprocessor.preprocess_text_in_df(X_train, columns=["description"])
    text_preprocessor.preprocess_text_in_df(X_val, columns=["description"])
    image_preprocessor.preprocess_images_in_df(X_train)
    image_preprocessor.preprocess_images_in_df(X_val)
    if with_test:
        text_preprocessor.preprocess_text_in_df(X_test, columns=["description"])
        image_preprocessor.preprocess_images_in_df(X_test)

    # Train Rnn model
    print('============================')
    print("Training RNN Model")
    text_rnn_model = TextRnnModel(file_path=input_data.model_path)
    _, rnn_best_epoch, rnn_best_f1, rnn_best_accuracy = text_rnn_model.preprocess_and_fit(X_train, y_train, X_val, y_val, n_epochs=n_epochs, full_train=full_train)
    print("Finished training RNN")

    print('============================')
    print("Training VGG")
    # Train VGG16 model
    image_vgg16_model = ImageVGG16Model(file_path=input_data.model_path)
    _, vgg16_best_epoch, vgg16_best_f1, vgg16_best_accuracy = image_vgg16_model.preprocess_and_fit(X_train, y_train, X_val, y_val, n_epochs=n_epochs, full_train=full_train)
    print("Finished training VGG")

    print('============================')
    # Reload the tokenizer and both models from the model directory.
    with open(input_data.model_path+"/tokenizer_config.json", "r", encoding="utf-8") as json_file:
        tokenizer_config = json_file.read()
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)

    rnn = load_model(input_data.model_path, "best_rnn_model.h5")
    vgg16 = load_model(input_data.model_path, "best_vgg16_model.h5")

    print("Training the concatenate model")
    model_concatenate = concatenate(tokenizer, rnn, vgg16)
    if samples_per_class > 0:
        new_samples_per_class = min(samples_per_class, 50)
    elif full_train:
        new_samples_per_class = 50
    else:
        new_samples_per_class = 0

    rnn_proba, vgg16_proba, new_y_train = model_concatenate.predict(X_train, y_train, new_samples_per_class=new_samples_per_class, random_state=random_state)
    best_weights, best_weighted_f1, best_accuracy, concatenate_train_size = model_concatenate.optimize(rnn_proba, vgg16_proba, new_y_train)

    with open(input_data.model_path+"/best_weights.json", "w") as file:
        json.dump(best_weights, file)

    t_fin = time.time()
    training_duration = t_fin - t_debut
    print("Training duration = {:.2f} sec".format(training_duration))
    print("Finished training concatenate model")
    print('============================')

    # Score the combined model on the test split (all zeros when skipped).
    t_debut = time.time()
    t_fin = t_debut
    concatenate_test_size = 0
    test_accuracy = 0
    test_f1 = 0
    if with_test:
        rnn_proba_test, vgg16_proba_test, new_y_test = model_concatenate.predict(X_test, y_test, new_samples_per_class=0, random_state=random_state)
        # Weighted sum of both models' probabilities, then the top class.
        combined_predictions = (best_weights[0] * rnn_proba_test) + (best_weights[1] * vgg16_proba_test)
        final_predictions = np.argmax(combined_predictions, axis=1)
        concatenate_test_size = len(new_y_test)
        test_accuracy = accuracy_score(new_y_test, final_predictions)
        test_f1 = f1_score(new_y_test, final_predictions, average='weighted')
        t_fin = time.time()
        print('============================')
        print("Testing the concatenate model")
        print("Test dataset size :", concatenate_test_size)
        print("Test: f1 score =", test_f1)
        print("Test accuracy score =", test_accuracy)
        print("Test duration = {:.2f} sec".format(t_fin - t_debut))
        print('============================')

    test_duration = t_fin - t_debut
    train_size = int(len(X_train))
    val_size = int(len(X_val))
    test_size = int(len(X_test))
    # Values are cast to built-in int/float before being written as JSON.
    performances_recap = {
        "Date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Input": {
            "epochs_requested": int(n_epochs),
            "samples_per_class": int(samples_per_class),
            "with_test": int(with_test),
            "random_state": int(random_state),
            "Dataset_size": {
                "Train": train_size,
                "Val": val_size,
                "Test": test_size,
            },
        },
        "Text": {
            # Best epoch is reported 1-based.
            "best_epoch": int(rnn_best_epoch+1),
            "f1": float(rnn_best_f1),
            "accuracy": float(rnn_best_accuracy),
        },
        "VGG16": {
            "best_epoch": int(vgg16_best_epoch+1),
            "f1": float(vgg16_best_f1),
            "accuracy": float(vgg16_best_accuracy),
        },
        "Concatenate": {
            "weight": best_weights,
            "Train": {
                "f1": float(best_weighted_f1),
                "accuracy": float(best_accuracy),
                "duration": int(training_duration),
                "size": int(concatenate_train_size),
            },
            "Test": {
                "f1": float(test_f1),
                "accuracy": float(test_accuracy),
                "duration": int(test_duration),
                "size": concatenate_test_size,
            },
        },
    }
    with open(input_data.model_path+"/performances.json", "w") as file:
        json.dump(performances_recap, file, indent=4)

    return {"message": "Entrainement effectuée avec succès"}