Spaces:
Paused
Paused
| from fastapi import FastAPI, Depends, HTTPException, status | |
| from fastapi.security import OAuth2PasswordBearer | |
| from pydantic import BaseModel | |
| from typing import List, Optional | |
| import requests | |
| import asyncio | |
| from src.features.build_features import DataImporter, TextPreprocessor, ImagePreprocessor | |
| from src.models.train_model_API import TextRnnModel, ImageVGG16Model, concatenate | |
| from tensorflow import keras | |
| from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score | |
| import pickle | |
| import tensorflow as tf | |
| import sys | |
| import json | |
| import time | |
| import numpy as np | |
| import pandas as pd | |
| from src.tools import f1_m, load_model | |
| import sys | |
| import random | |
| import datetime | |
# Instantiate the FastAPI app.
app = FastAPI()

# Bearer-token extractor used as a dependency by the training handler.
# auto_error=False makes the dependency yield None when no Authorization
# header is sent instead of rejecting the request with a 401 up front; the
# handler declares its token as Optional and only validates it when the
# request body sets `api_secured`, so unsecured calls must be let through.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token", auto_error=False)
class TrainInput(BaseModel):
    # Request body describing one training run, as consumed by `main`.
    # Every field has a default, so an empty JSON object is a valid request.

    # CSV files handed to DataImporter (features and labels respectively).
    x_train_path: Optional[str] = "data/preprocessed/X_train_update.csv"
    y_train_path: Optional[str] = "data/preprocessed/Y_train_CVw08PX.csv"
    # Directory handed to ImagePreprocessor.
    images_path: Optional[str] = "data/preprocessed/image_train"
    # Directory the models are given as file_path; `main` also reads
    # tokenizer_config.json from it and writes best_weights.json and
    # performances.json into it.
    model_path: Optional[str] = "models"
    # Forwarded as n_epochs to both the text and the image model fit.
    n_epochs: Optional[int] = 1
    # Forwarded to split_train_test when full_train is true.
    samples_per_class: Optional[int] = 50  # Caution: if samples_per_class == 0, the full dataset will be used
    # When true, a test split is preprocessed and scored after training.
    with_test: Optional[bool] = False
    # Seed forwarded to the split and predict calls; a negative value makes
    # `main` draw a random seed between 0 and 100.
    random_state: Optional[int] = 42
    # True: train on the split as requested. False: fine-tuning mode, where
    # `main` splits with 10 samples per class and appends the last
    # n_sales_ft rows of the loaded data to the training set.
    full_train: Optional[bool] = True
    # Number of trailing rows appended in fine-tuning mode.
    n_sales_ft: Optional[int] = 50
    # When true, `main` validates the bearer token against the auth service
    # before training.
    api_secured: Optional[bool] = False
# Endpoint of the authentication service that validates bearer tokens.
_AUTH_URL = "http://api_oauth:8001/secured"
# Upper bound, in seconds, on the authentication round-trip so that an
# unreachable auth service cannot hang the request forever.
_AUTH_TIMEOUT_SECONDS = 10
# Minimum value of the auth service's 'Authorization' field that is allowed
# to launch a training run.
_MIN_TRAINING_AUTHORIZATION = 2


def _check_training_rights(token):
    """Validate *token* against the auth service.

    Returns None when the caller may train, or the refusal payload to send
    back when the caller is authenticated but lacks the required level.

    Raises:
        HTTPException: 401 when no token was supplied, 503 when the auth
            service cannot be reached, or the auth service's own status
            code when it does not answer 200.
    """
    if not token:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Non autorisé à accéder à l'entrainment du modèle",
            headers={"WWW-Authenticate": "Bearer"},
        )
    try:
        auth_response = requests.get(
            _AUTH_URL,
            headers={"Authorization": f"Bearer {token}"},
            timeout=_AUTH_TIMEOUT_SECONDS,
        )
    except requests.RequestException as err:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Service d'authentification indisponible",
        ) from err

    if auth_response.status_code != 200:
        raise HTTPException(
            status_code=auth_response.status_code,
            detail="Non autorisé à accéder à l'entrainment du modèle",
        )

    user_data = auth_response.json()
    if user_data['Authorization'] < _MIN_TRAINING_AUTHORIZATION:
        user_info = user_data['FirstName'] + " " + user_data['LastName']
        return {"message": f"{user_info} n'est pas autorisé à effectuer l'entrainment du modèle"}
    return None


def _resolve_random_state(requested):
    """Return the seed for this run: *requested*, or a random one in [0, 100]
    when it is missing or negative."""
    if requested is None or requested < 0:
        return random.randint(0, 100)
    return requested


def _build_datasets(data_importer, df, *, full_train, samples_per_class,
                    n_sales_ft, random_state, with_test):
    """Split *df* into train/val/test features and labels.

    In full-train mode the split is returned as produced by the importer.
    Otherwise (fine-tuning) the split uses 10 samples per class and the last
    *n_sales_ft* rows of *df* are appended to the training set.
    """
    if full_train:
        return data_importer.split_train_test(
            df, samples_per_class=samples_per_class,
            random_state=random_state, with_test=with_test)

    X_train, X_val, X_test, y_train, y_val, y_test = data_importer.split_train_test(
        df, samples_per_class=10, random_state=random_state, with_test=with_test)

    # tail() rather than df[-n:]: with n == 0 the slice df[-0:] is df[0:],
    # i.e. the whole frame, whereas zero extra rows is what was asked for.
    recent_rows = df.tail(n_sales_ft)
    recent_labels = recent_rows["prdtypecode"]
    recent_features = recent_rows.drop(["prdtypecode"], axis=1)
    y_train = pd.concat([y_train, recent_labels], axis=0).reset_index(drop=True)
    X_train = pd.concat([X_train, recent_features], axis=0).reset_index(drop=True)

    print('============================')
    print("Final Finetuning Dataset size : ", len(X_train)+len(X_val)+len(X_test))
    print("Final Finetuning Train size : ", len(X_train))
    print("Final Finetuning Val size : ", len(X_val))
    print("Final Finetuning Test size : ", len(X_test))
    print('============================')
    return X_train, X_val, X_test, y_train, y_val, y_test


def _evaluate_on_test(model_concatenate, X_test, y_test, best_weights, random_state):
    """Score the weighted combination on the test split.

    Returns a (size, accuracy, weighted_f1) tuple.
    """
    rnn_proba_test, vgg16_proba_test, new_y_test = model_concatenate.predict(
        X_test, y_test, new_samples_per_class=0, random_state=random_state)
    combined_predictions = (best_weights[0] * rnn_proba_test) + (best_weights[1] * vgg16_proba_test)
    final_predictions = np.argmax(combined_predictions, axis=1)
    return (
        len(new_y_test),
        accuracy_score(new_y_test, final_predictions),
        f1_score(new_y_test, final_predictions, average='weighted'),
    )


async def main(input_data: TrainInput, token: Optional[str] = Depends(oauth2_scheme)):
    """Train the text, image and combined models and record their scores.

    Steps: optional authorization check, data loading and splitting, text and
    image preprocessing, RNN and VGG16 training, search for the combination
    weights, optional scoring on the test split, then a JSON recap written to
    ``<model_path>/performances.json``.

    Args:
        input_data: Paths and sizing options for the run.
        token: Bearer token; only checked when ``input_data.api_secured``.

    Returns:
        A ``{"message": ...}`` dict: the success message, or a refusal
        message when the authenticated user lacks the required level.

    Raises:
        HTTPException: When ``api_secured`` is set and the token is missing
            or rejected, or the auth service is unreachable.
    """
    # NOTE(review): no route decorator is visible on this handler in this
    # file; confirm how it is registered on `app`.
    # NOTE(review): nothing below is awaited, so the event loop is blocked
    # for the whole duration of the training run.

    # When api_secured is set, verify the caller's credentials first.
    if input_data.api_secured:
        refusal = _check_training_rights(token)
        if refusal is not None:
            return refusal

    # bool() rather than `!= 0` so that an explicit null means "no test".
    with_test = bool(input_data.with_test)
    samples_per_class = input_data.samples_per_class
    n_epochs = input_data.n_epochs
    full_train = input_data.full_train
    n_sales_ft = input_data.n_sales_ft
    random_state = _resolve_random_state(input_data.random_state)
    model_path = input_data.model_path

    t_debut = time.time()
    data_importer = DataImporter(input_data.x_train_path, input_data.y_train_path, model_path)
    df = data_importer.load_data()
    X_train, X_val, X_test, y_train, y_val, y_test = _build_datasets(
        data_importer, df,
        full_train=full_train,
        samples_per_class=samples_per_class,
        n_sales_ft=n_sales_ft,
        random_state=random_state,
        with_test=with_test,
    )
    if not full_train:
        # Fine-tuning ignores the requested samples_per_class; zero it so the
        # combination step and the recap below reflect that.
        # NOTE(review): the source listing lost its indentation; this reset is
        # assumed to belong to the fine-tuning branch only — confirm.
        samples_per_class = 0

    # Preprocess text and images
    text_preprocessor = TextPreprocessor()
    image_preprocessor = ImagePreprocessor(input_data.images_path)
    text_preprocessor.preprocess_text_in_df(X_train, columns=["description"])
    text_preprocessor.preprocess_text_in_df(X_val, columns=["description"])
    image_preprocessor.preprocess_images_in_df(X_train)
    image_preprocessor.preprocess_images_in_df(X_val)
    if with_test:
        text_preprocessor.preprocess_text_in_df(X_test, columns=["description"])
        image_preprocessor.preprocess_images_in_df(X_test)

    # Train Rnn model
    print('============================')
    print("Training RNN Model")
    text_rnn_model = TextRnnModel(file_path=model_path)
    rnn_history, rnn_best_epoch, rnn_best_f1, rnn_best_accuracy = text_rnn_model.preprocess_and_fit(
        X_train, y_train, X_val, y_val, n_epochs=n_epochs, full_train=full_train)
    print("Finished training RNN")
    print('============================')

    # Train VGG16 model
    print("Training VGG")
    image_vgg16_model = ImageVGG16Model(file_path=model_path)
    vgg16_history, vgg16_best_epoch, vgg16_best_f1, vgg16_best_accuracy = image_vgg16_model.preprocess_and_fit(
        X_train, y_train, X_val, y_val, n_epochs=n_epochs, full_train=full_train)
    print("Finished training VGG")
    print('============================')

    # Reload the tokenizer and the two saved models from model_path.
    with open(model_path+"/tokenizer_config.json", "r", encoding="utf-8") as json_file:
        tokenizer_config = json_file.read()
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)
    rnn = load_model(model_path, "best_rnn_model.h5")
    vgg16 = load_model(model_path, "best_vgg16_model.h5")

    print("Training the concatenate model")
    model_concatenate = concatenate(tokenizer, rnn, vgg16)
    # Per-class sample count for the weight search: capped at 50 when a
    # positive count was requested, 50 for a full train on the whole dataset,
    # and 0 in fine-tuning mode.
    if samples_per_class > 0:
        new_samples_per_class = min(samples_per_class, 50)
    elif full_train:
        new_samples_per_class = 50
    else:
        new_samples_per_class = 0
    rnn_proba, vgg16_proba, new_y_train = model_concatenate.predict(
        X_train, y_train, new_samples_per_class=new_samples_per_class, random_state=random_state)
    best_weights, best_weighted_f1, best_accuracy, concatenate_train_size = model_concatenate.optimize(
        rnn_proba, vgg16_proba, new_y_train)
    with open(model_path+"/best_weights.json", "w") as file:
        json.dump(best_weights, file)
    training_duration = time.time() - t_debut
    print("Training duration = {:.2f} sec".format(training_duration))
    print("Finished training concatenate model")
    print('============================')

    # Measure performance on the test split (all zeros when with_test is off).
    t_debut = time.time()
    t_fin = t_debut
    concatenate_test_size = 0
    test_accuracy = 0
    test_f1 = 0
    if with_test:
        concatenate_test_size, test_accuracy, test_f1 = _evaluate_on_test(
            model_concatenate, X_test, y_test, best_weights, random_state)
        t_fin = time.time()
        print('============================')
        print("Testing the concatenate model")
        print("Test dataset size :", concatenate_test_size)
        print("Test: f1 score =", test_f1)
        print("Test accuracy score =", test_accuracy)
        print("Test duration = {:.2f} sec".format(t_fin - t_debut))
        print('============================')
    test_duration = t_fin - t_debut

    performances_recap = {
        "Date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Input": {
            "epochs_requested": int(n_epochs),
            "samples_per_class": int(samples_per_class),
            "with_test": int(with_test),
            "random_state": int(random_state),
            "Dataset_size": {
                "Train": int(len(X_train)),
                "Val": int(len(X_val)),
                "Test": int(len(X_test)),
            },
        },
        "Text": {
            # Best epochs are reported 1-based.
            "best_epoch": int(rnn_best_epoch+1),
            "f1": float(rnn_best_f1),
            "accuracy": float(rnn_best_accuracy),
        },
        "VGG16": {
            "best_epoch": int(vgg16_best_epoch+1),
            "f1": float(vgg16_best_f1),
            "accuracy": float(vgg16_best_accuracy),
        },
        "Concatenate": {
            "weight": best_weights,
            "Train": {
                "f1": float(best_weighted_f1),
                "accuracy": float(best_accuracy),
                "duration": int(training_duration),
                "size": int(concatenate_train_size),
            },
            "Test": {
                "f1": float(test_f1),
                "accuracy": float(test_accuracy),
                "duration": int(test_duration),
                "size": concatenate_test_size,
            },
        },
    }
    with open(model_path+"/performances.json", "w") as file:
        json.dump(performances_recap, file, indent=4)
    return {"message": "Entrainement effectuée avec succès"}