Spaces:

Demosthene-OR
/

rakuten

Paused

File size: 9,621 Bytes

eb5ec73

from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer
from pydantic import BaseModel
from typing import List, Optional
import requests
import asyncio

from src.features.build_features import DataImporter, TextPreprocessor, ImagePreprocessor
from src.models.train_model_API import TextRnnModel, ImageVGG16Model, concatenate
from tensorflow import keras
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
import tensorflow as tf
import sys
import json
import time
import numpy as np
import pandas as pd
from src.tools import f1_m, load_model
import sys
import random
import datetime

# Instantiate the FastAPI application that hosts the training endpoint.
app = FastAPI()
# Bearer-token security scheme used as a dependency by the endpoint below.
# NOTE(review): with FastAPI's default settings this scheme is expected to
# reject requests that carry no Authorization header - confirm that this is
# intended for calls made with api_secured=False.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")

class TrainInput(BaseModel):
    """Request body of the POST /train endpoint.

    Every field is optional and falls back to the default declared below.
    """

    # CSV paths handed to DataImporter to load the features and the labels.
    x_train_path: Optional[str] = "data/preprocessed/X_train_update.csv"
    y_train_path: Optional[str] = "data/preprocessed/Y_train_CVw08PX.csv"
    # Image directory handed to ImagePreprocessor.
    images_path: Optional[str] = "data/preprocessed/image_train"
    # Directory given to DataImporter and to both models; the endpoint also
    # reads tokenizer_config.json from it and writes best_weights.json and
    # performances.json into it.
    model_path: Optional[str] = "models"
    # Number of epochs forwarded to the RNN and VGG16 training calls.
    n_epochs: Optional[int] = 1
    # Forwarded to the train/val/test split when full_train is true.
    # Caution: if samples_per_class == 0, the full dataset will be used.
    samples_per_class: Optional[int] = 50
    # When true, the test split is preprocessed and the combined model is
    # evaluated on it.
    with_test: Optional[bool] = False
    # Seed forwarded to the split and to the ensemble sampling; a negative
    # value makes the endpoint draw a random seed between 0 and 100.
    random_state: Optional[int] = 42
    # True: regular training. False: fine-tuning on a split made with
    # samples_per_class=10 plus the last n_sales_ft rows of the dataset.
    full_train: Optional[bool] = True
    # Number of trailing dataset rows appended to the training set when
    # full_train is false.
    n_sales_ft: Optional[int] = 50
    # When true, the bearer token is validated against the authentication
    # service before training starts.
    api_secured: Optional[bool] = False

# Endpoint of the authentication service that validates bearer tokens.
_AUTH_URL = "http://api_oauth:8001/secured"
# Upper bound (seconds) on the authentication call, so that an unresponsive
# authentication service cannot hang a training request forever.
_AUTH_TIMEOUT_S = 10


def _check_training_rights(token):
    """Validate *token* against the authentication service.

    Returns:
        None when the user may train the model, otherwise the response
        payload to send back to a known but insufficiently privileged user.

    Raises:
        HTTPException: 503 when the authentication service cannot be reached
            or times out; the service's own status code when it answers with
            anything other than 200.
    """
    try:
        auth_response = requests.get(
            _AUTH_URL,
            headers={"Authorization": f"Bearer {token}"},
            timeout=_AUTH_TIMEOUT_S,
        )
    except requests.exceptions.RequestException as err:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Service d'authentification indisponible",
        ) from err

    if auth_response.status_code != 200:
        raise HTTPException(status_code=auth_response.status_code, detail="Non autorisé à accéder à l'entrainment du modèle")

    user_data = auth_response.json()
    user_info = user_data['FirstName']+" "+user_data['LastName']
    # Authorization levels below 2 are not allowed to train the model.
    if user_data['Authorization'] < 2:
        return {"message": f"{user_info} n'est pas autorisé à effectuer l'entrainment du modèle"}
    return None


@app.post("/train")
async def main(input_data: TrainInput, token: Optional[str] = Depends(oauth2_scheme)):
    """Train the text RNN, the image VGG16 and their weighted combination.

    Steps: optional authorization check, dataset loading and splitting, text
    and image preprocessing, training of both models, search for the best
    combination weights, optional evaluation on the test split, and writing
    of ``best_weights.json`` and ``performances.json`` under
    ``input_data.model_path``.

    Args:
        input_data: Training parameters (see ``TrainInput``).
        token: Bearer token supplied by the ``oauth2_scheme`` dependency;
            only checked when ``input_data.api_secured`` is true.

    Returns:
        A dict with a single ``"message"`` key: either the success message or
        the refusal message for an insufficiently privileged user.

    Raises:
        HTTPException: when the authentication service rejects the token or
            cannot be reached.
    """
    # NOTE(review): this coroutine performs blocking work (HTTP call, model
    # training), which keeps the event loop busy for the whole training run.
    # Consider a plain ``def`` endpoint or a background task.

    # When the API is secured, check the caller's credentials first.
    if input_data.api_secured:
        denial = _check_training_rights(token)
        if denial is not None:
            return denial

    # bool() also maps an explicit null to False instead of True.
    with_test = bool(input_data.with_test)
    samples_per_class = input_data.samples_per_class
    n_epochs = input_data.n_epochs
    full_train = input_data.full_train
    n_sales_ft = input_data.n_sales_ft
    # A missing or negative seed means "pick one at random".
    random_state = input_data.random_state
    if random_state is None or random_state < 0:
        random_state = random.randint(0, 100)

    separator = '============================'

    t_debut = time.time()
    data_importer = DataImporter(input_data.x_train_path, input_data.y_train_path, input_data.model_path)
    df = data_importer.load_data()

    if full_train:
        X_train, X_val, X_test, y_train, y_val, y_test = data_importer.split_train_test(
            df, samples_per_class=samples_per_class, random_state=random_state, with_test=with_test
        )
    else:
        # Fine-tuning: a small balanced split plus the most recent rows.
        X_train, X_val, X_test, y_train, y_val, y_test = data_importer.split_train_test(
            df, samples_per_class=10, random_state=random_state, with_test=with_test
        )
        # tail() returns no rows for n == 0, whereas df[-0:] would return the
        # whole dataset.
        recent_rows = df.tail(n_sales_ft)
        recent_y = recent_rows["prdtypecode"]
        recent_X = recent_rows.drop(columns=["prdtypecode"])
        y_train = pd.concat([y_train, recent_y], axis=0).reset_index(drop=True)
        X_train = pd.concat([X_train, recent_X], axis=0).reset_index(drop=True)
        print(separator)
        print("Final Finetuning Dataset size : ", len(X_train)+len(X_val)+len(X_test))
        print("Final Finetuning Train size   : ", len(X_train))
        print("Final Finetuning Val size     : ", len(X_val))
        print("Final Finetuning Test size    : ", len(X_test))
        print(separator)
        # Recorded as 0 in the recap and used below to size the ensemble set.
        samples_per_class = 0

    # Preprocess text and images
    text_preprocessor = TextPreprocessor()
    image_preprocessor = ImagePreprocessor(input_data.images_path)
    text_preprocessor.preprocess_text_in_df(X_train, columns=["description"])
    text_preprocessor.preprocess_text_in_df(X_val, columns=["description"])
    image_preprocessor.preprocess_images_in_df(X_train)
    image_preprocessor.preprocess_images_in_df(X_val)
    if with_test:
        text_preprocessor.preprocess_text_in_df(X_test, columns=["description"])
        image_preprocessor.preprocess_images_in_df(X_test)

    # Train the RNN model (the training history is not used here)
    print(separator)
    print("Training RNN Model")
    text_rnn_model = TextRnnModel(file_path=input_data.model_path)
    _, rnn_best_epoch, rnn_best_f1, rnn_best_accuracy = text_rnn_model.preprocess_and_fit(
        X_train, y_train, X_val, y_val, n_epochs=n_epochs, full_train=full_train
    )
    print("Finished training RNN")

    # Train the VGG16 model (the training history is not used here)
    print(separator)
    print("Training VGG")
    image_vgg16_model = ImageVGG16Model(file_path=input_data.model_path)
    _, vgg16_best_epoch, vgg16_best_f1, vgg16_best_accuracy = image_vgg16_model.preprocess_and_fit(
        X_train, y_train, X_val, y_val, n_epochs=n_epochs, full_train=full_train
    )
    print("Finished training VGG")

    # Reload the tokenizer and both models saved under model_path
    print(separator)
    with open(input_data.model_path+"/tokenizer_config.json", "r", encoding="utf-8") as json_file:
        tokenizer_config = json_file.read()
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)

    rnn = load_model(input_data.model_path, "best_rnn_model.h5")
    vgg16 = load_model(input_data.model_path, "best_vgg16_model.h5")

    print("Training the concatenate model")
    model_concatenate = concatenate(tokenizer, rnn, vgg16)
    # The ensemble weights are searched on at most 50 samples per class; in
    # fine-tuning mode 0 is passed instead.
    if samples_per_class > 0:
        new_samples_per_class = min(samples_per_class, 50)
    elif full_train:
        new_samples_per_class = 50
    else:
        new_samples_per_class = 0

    rnn_proba, vgg16_proba, new_y_train = model_concatenate.predict(
        X_train, y_train, new_samples_per_class=new_samples_per_class, random_state=random_state
    )
    best_weights, best_weighted_f1, best_accuracy, concatenate_train_size = model_concatenate.optimize(
        rnn_proba, vgg16_proba, new_y_train
    )

    with open(input_data.model_path+"/best_weights.json", "w") as file:
        json.dump(best_weights, file)

    t_fin = time.time()
    training_duration = t_fin - t_debut
    print("Training duration = {:.2f} sec".format(training_duration))
    print("Finished training concatenate model")
    print(separator)

    # Measure the performance on the test split; the defaults below describe
    # "no test run" and yield a zero duration.
    t_debut = time.time()
    t_fin = t_debut
    concatenate_test_size = 0
    test_accuracy = 0
    test_f1 = 0
    if with_test:
        rnn_proba_test, vgg16_proba_test, new_y_test = model_concatenate.predict(
            X_test, y_test, new_samples_per_class=0, random_state=random_state
        )
        combined_predictions = (best_weights[0] * rnn_proba_test) + (best_weights[1] * vgg16_proba_test)
        final_predictions = np.argmax(combined_predictions, axis=1)
        concatenate_test_size = len(new_y_test)
        test_accuracy = accuracy_score(new_y_test, final_predictions)
        test_f1 = f1_score(new_y_test, final_predictions, average='weighted')
        t_fin = time.time()
        print(separator)
        print("Testing the concatenate model")
        print("Test dataset size :", concatenate_test_size)
        print("Test: f1 score =", test_f1)
        print("Test accuracy score =", test_accuracy)
        print("Test duration = {:.2f} sec".format(t_fin - t_debut))
        print(separator)

    test_duration = t_fin - t_debut
    performances_recap = {
        "Date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Input": {
            "epochs_requested": int(n_epochs),
            "samples_per_class": int(samples_per_class),
            "with_test": int(with_test),
            "random_state": int(random_state),
            "Dataset_size": {
                "Train": int(len(X_train)),
                "Val": int(len(X_val)),
                "Test": int(len(X_test)),
            },
        },
        "Text": {
            # Best epochs are incremented by one before being stored.
            "best_epoch": int(rnn_best_epoch+1),
            "f1": float(rnn_best_f1),
            "accuracy": float(rnn_best_accuracy),
        },
        "VGG16": {
            "best_epoch": int(vgg16_best_epoch+1),
            "f1": float(vgg16_best_f1),
            "accuracy": float(vgg16_best_accuracy),
        },
        "Concatenate": {
            "weight": best_weights,
            "Train": {
                "f1": float(best_weighted_f1),
                "accuracy": float(best_accuracy),
                "duration": int(training_duration),
                "size": int(concatenate_train_size),
            },
            "Test": {
                "f1": float(test_f1),
                "accuracy": float(test_accuracy),
                "duration": int(test_duration),
                "size": concatenate_test_size,
            },
        },
    }
    with open(input_data.model_path+"/performances.json", "w") as file:
        json.dump(performances_recap, file, indent=4)

    return {"message": "Entrainement effectuée avec succès"}