Spaces:

Demosthene-OR
/

rakuten

Paused

App Files Files Community

rakuten / src /main.py

Demosthene-OR

Configure LFS for images and update code

eb5ec73 3 months ago

raw

history blame contribute delete

8.13 kB

	from features.build_features import DataImporter, TextPreprocessor, ImagePreprocessor
	from models.train_model import TextRnnModel, ImageVGG16Model, concatenate
	from tensorflow import keras
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
	import pickle
	import tensorflow as tf
	import sys
	import json
	import time
	import argparse
	import numpy as np
	from tools import f1_m, load_model
	import sys
	import random
	import datetime


	def _parse_args():
	    """Parse the command-line options of the training script.

	    Returns:
	        argparse.Namespace: the parsed options (data paths, model directory,
	        number of epochs, sampling size, test flag and random seed).
	    """
	    parser = argparse.ArgumentParser(description="Input data")
	    parser.add_argument("--x_train_path", default="data/preprocessed/X_train_update.csv", type=str,
	                        help="File path for the X_train input CSV file.")
	    parser.add_argument("--y_train_path", default="data/preprocessed/Y_train_CVw08PX.csv", type=str,
	                        help="File path for the Y_train input CSV file.")
	    parser.add_argument("--images_path", default="data/preprocessed/image_train", type=str,
	                        help="Base path for the images.")
	    parser.add_argument("--model_path", default="models", type=str, help="Base path for the models")
	    parser.add_argument("--n_epochs", default=1, type=int, help="Num epochs")
	    # Caution: if samples_per_class == 0, the full dataset will be used.
	    parser.add_argument("--samples_per_class", default=50, type=int, help="Num samples per class")
	    parser.add_argument("--with_test", default=0, type=int, help="Compute performance on Test dataset")
	    parser.add_argument("--random_state", default=42, type=int, help="random_state")
	    return parser.parse_args()


	def main():
	    """Train the text and image models, fit their blend weights and report.

	    Steps, in order:
	      1. Load the data and split it into train / validation / test sets.
	      2. Preprocess the "description" text column and the images.
	      3. Fit ``TextRnnModel`` and then ``ImageVGG16Model``.
	      4. Reload the tokenizer and the two saved models from ``--model_path``
	         (presumably written by step 3 - confirm in ``models.train_model``)
	         and optimize the two weights used to combine their outputs.
	      5. When ``--with_test`` is non-zero, score the weighted combination on
	         the test set.
	      6. Write ``best_weights.json`` and ``performances.json`` under
	         ``--model_path``.
	    """
	    # Local import so this block stays self-contained; only used for paths.
	    import os

	    args = _parse_args()
	    with_test = args.with_test != 0
	    samples_per_class = args.samples_per_class
	    n_epochs = args.n_epochs
	    model_dir = args.model_path
	    # A negative --random_state asks for a random seed drawn from [0, 100].
	    random_state = args.random_state if args.random_state >= 0 else random.randint(0, 100)

	    train_start = time.time()
	    data_importer = DataImporter(args.x_train_path, args.y_train_path, model_dir)
	    df = data_importer.load_data()

	    X_train, X_val, X_test, y_train, y_val, y_test = data_importer.split_train_test(
	        df,
	        samples_per_class=samples_per_class,
	        random_state=random_state,
	        with_test=with_test,
	    )

	    # Preprocess text and images; the test set is only touched when it is
	    # going to be evaluated.
	    text_preprocessor = TextPreprocessor()
	    image_preprocessor = ImagePreprocessor(args.images_path)
	    text_preprocessor.preprocess_text_in_df(X_train, columns=["description"])
	    text_preprocessor.preprocess_text_in_df(X_val, columns=["description"])
	    image_preprocessor.preprocess_images_in_df(X_train)
	    image_preprocessor.preprocess_images_in_df(X_val)
	    if with_test:
	        text_preprocessor.preprocess_text_in_df(X_test, columns=["description"])
	        image_preprocessor.preprocess_images_in_df(X_test)

	    # Train the text (RNN) model. The returned history object is not used.
	    print('============================')
	    print("Training RNN Model")
	    text_rnn_model = TextRnnModel(file_path=model_dir)
	    _, rnn_best_f1_epoch, rnn_best_f1, rnn_best_accuracy = text_rnn_model.preprocess_and_fit(
	        X_train, y_train, X_val, y_val, n_epochs=n_epochs
	    )
	    print("Finished training RNN")

	    # Train the image (VGG16) model. The returned history object is not used.
	    print('============================')
	    print("Training VGG")
	    image_vgg16_model = ImageVGG16Model(file_path=model_dir)
	    _, vgg16_best_f1_epoch, vgg16_best_f1, vgg16_best_accuracy = image_vgg16_model.preprocess_and_fit(
	        X_train, y_train, X_val, y_val, n_epochs=n_epochs
	    )
	    print("Finished training VGG")

	    # Reload the artifacts from disk rather than reusing in-memory objects.
	    print('============================')
	    with open(os.path.join(model_dir, "tokenizer_config.json"), "r", encoding="utf-8") as json_file:
	        tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json_file.read())

	    rnn = load_model(model_dir, "best_rnn_model.h5")
	    vgg16 = load_model(model_dir, "best_vgg16_model.h5")

	    print("Training the concatenate model")
	    model_concatenate = concatenate(tokenizer, rnn, vgg16)
	    # The weight search uses at most 50 samples per class; a non-positive
	    # --samples_per_class also falls back to 50.
	    new_samples_per_class = min(samples_per_class, 50) if samples_per_class > 0 else 50

	    rnn_proba, vgg16_proba, new_y_train = model_concatenate.predict(
	        X_train, y_train, new_samples_per_class=new_samples_per_class, random_state=random_state
	    )
	    best_weights, best_weighted_f1, best_accuracy, concatenate_train_size = model_concatenate.optimize(
	        rnn_proba, vgg16_proba, new_y_train
	    )

	    # Only the two blend weights are persisted; no combined Keras model is saved.
	    with open(os.path.join(model_dir, "best_weights.json"), "w", encoding="utf-8") as file:
	        json.dump(best_weights, file)

	    training_duration = time.time() - train_start
	    print("Training duration = {:.2f} sec".format(training_duration))
	    print("Finished training concatenate model")
	    print('============================')

	    # Compute the performance on the test dataset. The defaults below are
	    # what ends up in the report when no test evaluation is requested.
	    test_start = time.time()
	    test_end = test_start
	    concatenate_test_size = 0
	    test_accuracy = 0
	    test_f1 = 0
	    if with_test:
	        rnn_proba_test, vgg16_proba_test, new_y_test = model_concatenate.predict(
	            X_test, y_test, new_samples_per_class=0, random_state=random_state
	        )
	        # Weighted sum of both models' outputs, then pick the top class per row.
	        combined_predictions = (best_weights[0] * rnn_proba_test) + (best_weights[1] * vgg16_proba_test)
	        final_predictions = np.argmax(combined_predictions, axis=1)
	        concatenate_test_size = len(new_y_test)
	        test_accuracy = accuracy_score(new_y_test, final_predictions)
	        test_f1 = f1_score(new_y_test, final_predictions, average='weighted')
	        test_end = time.time()
	        print('============================')
	        print("Testing the concatenate model")
	        print("Test dataset size :", concatenate_test_size)
	        print("Test: f1 score =", test_f1)
	        print("Test accuracy score =", test_accuracy)
	        print("Test duration = {:.2f} sec".format(test_end - test_start))
	        print('============================')

	    test_duration = test_end - test_start
	    # NOTE(review): what split_train_test returns for X_test when with_test is
	    # False is not visible here; guard against None so the report is still
	    # written after a long training run.
	    test_size = len(X_test) if X_test is not None else 0

	    performances_recap = {
	        "Date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	        "Input": {
	            "epochs_requested": int(n_epochs),
	            "samples_per_class": int(samples_per_class),
	            "with_test": int(with_test),
	            "random_state": int(random_state),
	            "Dataset_size": {
	                "Train": len(X_train),
	                "Val": len(X_val),
	                "Test": test_size,
	            },
	        },
	        "Text": {
	            # Epoch indices are reported 1-based.
	            "best_epoch": int(rnn_best_f1_epoch + 1),
	            "f1": float(rnn_best_f1),
	            "accuracy": float(rnn_best_accuracy),
	        },
	        "VGG16": {
	            "best_epoch": int(vgg16_best_f1_epoch + 1),
	            "f1": float(vgg16_best_f1),
	            "accuracy": float(vgg16_best_accuracy),
	        },
	        "Concatenate": {
	            "weight": best_weights,
	            "Train": {
	                "f1": float(best_weighted_f1),
	                "accuracy": float(best_accuracy),
	                "duration": int(training_duration),
	                "size": int(concatenate_train_size),
	            },
	            "Test": {
	                "f1": float(test_f1),
	                "accuracy": float(test_accuracy),
	                "duration": int(test_duration),
	                "size": concatenate_test_size,
	            },
	        },
	    }
	    with open(os.path.join(model_dir, "performances.json"), "w", encoding="utf-8") as file:
	        json.dump(performances_recap, file, indent=4)


	# Run the training pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()