import sys
from pathlib import Path

# Put the project root on sys.path *before* the project imports below,
# so they resolve when this module is run as a script.
file = Path(__file__).resolve()
parent, root = file.parent, file.parents[1]
sys.path.append(str(root))

from utilities.utilities_common import *
from config.core import *
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import default_data_collator, VisionEncoderDecoderModel
from PIL import Image

# Package-level paths for config, dataset, and trained-model artifacts
PACKAGE_ROOT = Path(__file__).resolve().parent
ROOT = PACKAGE_ROOT.parent
CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
DATASET_DIR = PACKAGE_ROOT / "dataset"
TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
CAPTIONS_DIR = DATASET_DIR / "captions.txt"
IMAGES_DIR = DATASET_DIR / "Images"
def run_training(str_image_dir_path, df_train, df_validation, device):
    # Wrap the training and validation dataframes in ImgDataset objects
    train_dataset = ImgDataset(df_train, root_dir=str_image_dir_path, tokenizer=tokenizer, feature_extractor=feature_extractor, transform=img_transforms)
    validation_dataset = ImgDataset(df_validation, root_dir=str_image_dir_path, tokenizer=tokenizer, feature_extractor=feature_extractor, transform=img_transforms)
| print("Encoder : ", config.lmodel_config.ENCODER) | |
| print("Decoder : ", config.lmodel_config.DECODER) | |
| # initialize the model | |
| model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(config.lmodel_config.ENCODER, config.lmodel_config.DECODER) | |
| print("Vocab Size : ", model.config.decoder.vocab_size) | |
    # Set the special-token ids used for decoding (the GPT-2 decoder starts
    # generation from its BOS token)
    model.config.decoder_start_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.sep_token_id
    # Make sure the vocab size is set correctly
    model.config.vocab_size = model.config.decoder.vocab_size

    # Set beam-search generation parameters
    model.config.max_length = config.lmodel_config.MAX_LEN
    model.config.early_stopping = config.lmodel_config.EARLY_STOPPING
    model.config.no_repeat_ngram_size = config.lmodel_config.NGRAM_SIZE
    model.config.length_penalty = config.lmodel_config.LEN_PENALTY
    model.config.num_beams = config.lmodel_config.NUM_BEAMS
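    # Note (version-dependent): these model.config values are the defaults that
    # model.generate() falls back to, including the generation performed during
    # evaluation when predict_with_generate=True.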
    # Define the training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=TRAINED_MODEL_DIR / 'VIT_large_gpt2',
        per_device_train_batch_size=config.lmodel_config.TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=config.lmodel_config.VAL_BATCH_SIZE,
        predict_with_generate=True,
        evaluation_strategy="epoch",
        do_train=True,
        do_eval=True,
        logging_steps=config.lmodel_config.NUM_LOGGING_STEPS,
        save_steps=2 * config.lmodel_config.NUM_LOGGING_STEPS,
        warmup_steps=config.lmodel_config.NUM_LOGGING_STEPS,
        learning_rate=5e-5,
        max_steps=1500,  # caps training at 1500 steps and overrides num_train_epochs; remove for a full run
        num_train_epochs=config.lmodel_config.EPOCHS,
        overwrite_output_dir=True,
        save_total_limit=1,
    )
    # Instantiate the trainer; the feature extractor is passed as `tokenizer`
    # so it is saved alongside the model checkpoints
    trainer = Seq2SeqTrainer(
        tokenizer=feature_extractor,
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        data_collator=default_data_collator,
    )
    trainer.train()

    # Save the trained model
    trainer.save_model(TRAINED_MODEL_DIR / 'VIT_large_gpt2')
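    # The saved directory can later be reloaded for inference, e.g. (illustrative):
    #   model = VisionEncoderDecoderModel.from_pretrained(TRAINED_MODEL_DIR / 'VIT_large_gpt2')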
    # Quick smoke test: caption one sample image with the freshly trained model
    print("Image dir : ", IMAGES_DIR)
    img = Image.open(IMAGES_DIR / "1000268201_693b08cb0e.jpg").convert("RGB")
    img.show()
    pixel_values = feature_extractor(img, return_tensors="pt").pixel_values.to(device)
    generated_caption = tokenizer.decode(model.generate(pixel_values)[0], skip_special_tokens=True)
    print('\033[96m' + generated_caption + '\033[0m')
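
# --- Hypothetical usage sketch, not part of the original pipeline ---
# A minimal driver showing how run_training might be invoked. Assumptions:
# captions.txt is a Flickr8k-style CSV with 'image' and 'caption' columns,
# ImgDataset consumes such a dataframe, and sklearn is available for the
# example split.
if __name__ == "__main__":
    import pandas as pd
    import torch
    from sklearn.model_selection import train_test_split

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    df = pd.read_csv(CAPTIONS_DIR)  # assumed CSV schema: image,caption
    df_train, df_validation = train_test_split(df, test_size=0.1, random_state=42)
    run_training(str(IMAGES_DIR), df_train, df_validation, device)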