import argparse  # used only by the commented-out get_args() below
import os
import random
from pathlib import Path

import numpy as np
import torch
from transformers import AutoTokenizer  # used only in the commented-out demo below

from pdvc.modules.modeling import UniVL
from pdvc.modules.tokenization import BertTokenizer

PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                               Path.home() / '.pytorch_pretrained_bert'))
class UniVL_args(object):
    """Default hyper-parameters for UniVL on the YouCook2 retrieval task."""

    def __init__(self) -> None:
        self.do_pretrain = False
        self.do_train = False
        self.do_eval = True
        self.train_csv = 'data/youcookii_singlef_train.csv'
        self.val_csv = 'data/youcookii_singlef_val.csv'
        self.data_path = 'data/youcookii_caption.pickle'
        self.features_path = 'data/youcookii_videos_feature.pickle'
        self.num_thread_reader = 1
        self.lr = 0.0001
        self.epochs = 20
        self.batch_size = 256
        self.batch_size_val = 3500
        self.lr_decay = 0.9
        self.n_display = 100
        self.video_dim = 1024
        self.seed = 42
        self.max_words = 48
        self.max_frames = 100
        self.feature_framerate = 1
        self.margin = 0.1
        self.hard_negative_rate = 0.5
        self.negative_weighting = 1
        self.n_pair = 1
        self.output_dir = None
        self.bert_model = "bert-base-uncased"
        self.visual_model = "visual-base"
        self.cross_model = "cross-base"
        self.decoder_model = "decoder-base"
        self.init_model = None
        self.do_lower_case = True
        self.warmup_proportion = 0.1
        self.gradient_accumulation_steps = 1
        self.n_gpu = 1
        self.cache_dir = ""
        self.fp16 = False
        self.fp16_opt_level = 'O1'
        self.task_type = "retrieval"
        self.datatype = "youcook"
        self.world_size = 0
        self.local_rank = 0
        self.coef_lr = 0.1
        self.use_mil = False
        self.sampled_use_mil = False
        self.text_num_hidden_layers = 12
        self.visual_num_hidden_layers = 6
        self.cross_num_hidden_layers = 2
        self.decoder_num_hidden_layers = 3
        self.train_sim_after_cross = False
        self.expand_msrvtt_sentences = False
        # Per-step batch size after accounting for gradient accumulation.
        self.batch_size = int(self.batch_size / self.gradient_accumulation_steps)

    def __repr__(self) -> str:
        return str(self.__dict__)
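# Illustrative usage sketch: the defaults above target YouCook2 retrieval;
# individual fields can be overridden after construction, e.g.:
#   args = UniVL_args()
#   args.max_words = 20  # e.g. the commented argparse default below
#   print(args)          # __repr__ dumps the full config dict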
# def get_args(description='UniVL on Retrieval Task'):
# parser = argparse.ArgumentParser(description=description)
# parser.add_argument("--do_pretrain", action='store_true', help="Whether to run training.")
# parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
# parser.add_argument("--do_eval", action='store_true', default=True, help="Whether to run eval on the dev set.")
# parser.add_argument('--train_csv', type=str, default='data/youcookii_singlef_train.csv', help='')
# parser.add_argument('--val_csv', type=str, default='data/youcookii_singlef_val.csv', help='')
# parser.add_argument('--data_path', type=str, default='data/youcookii_caption.pickle', help='data pickle file path')
# parser.add_argument('--features_path', type=str, default='data/youcookii_videos_feature.pickle', help='feature path')
# parser.add_argument('--num_thread_reader', type=int, default=1, help='')
# parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate')
# parser.add_argument('--epochs', type=int, default=20, help='upper epoch limit')
# parser.add_argument('--batch_size', type=int, default=256, help='batch size')
# parser.add_argument('--batch_size_val', type=int, default=3500, help='batch size eval')
# parser.add_argument('--lr_decay', type=float, default=0.9, help='Learning rate exp epoch decay')
# parser.add_argument('--n_display', type=int, default=100, help='Information display frequency')
# parser.add_argument('--video_dim', type=int, default=1024, help='video feature dimension')
# parser.add_argument('--seed', type=int, default=42, help='random seed')
# parser.add_argument('--max_words', type=int, default=20, help='')
# parser.add_argument('--max_frames', type=int, default=100, help='')
# parser.add_argument('--feature_framerate', type=int, default=1, help='')
# parser.add_argument('--margin', type=float, default=0.1, help='margin for loss')
# parser.add_argument('--hard_negative_rate', type=float, default=0.5, help='rate of intra negative sample')
# parser.add_argument('--negative_weighting', type=int, default=1, help='Weight the loss for intra negative')
# parser.add_argument('--n_pair', type=int, default=1, help='Number of pairs to output from the data loader')
# parser.add_argument("--output_dir", default=None, type=str,
# help="The output directory where the model predictions and checkpoints will be written.")
# parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
# help="Bert pre-trained model")
# parser.add_argument("--visual_model", default="visual-base", type=str, required=False, help="Visual module")
# parser.add_argument("--cross_model", default="cross-base", type=str, required=False, help="Cross module")
# parser.add_argument("--decoder_model", default="decoder-base", type=str, required=False, help="Decoder module")
# parser.add_argument("--init_model", default=None, type=str, required=False, help="Initial model.")
# parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
# parser.add_argument("--warmup_proportion", default=0.1, type=float,
# help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.")
# parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
# help="Number of updates steps to accumulate before performing a backward/update pass.")
# parser.add_argument('--n_gpu', type=int, default=1, help="Overwritten at execution time.")
# parser.add_argument("--cache_dir", default="", type=str,
# help="Where do you want to store the pre-trained models downloaded from s3")
# parser.add_argument('--fp16', action='store_true',
# help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
# parser.add_argument('--fp16_opt_level', type=str, default='O1',
# help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
# "See details at https://nvidia.github.io/apex/amp.html")
# parser.add_argument("--task_type", default="retrieval", type=str, help="Point the task `retrieval` to finetune.")
# parser.add_argument("--datatype", default="youcook", type=str, help="Point the dataset `youcook` to finetune.")
# parser.add_argument("--world_size", default=0, type=int, help="distribted training")
# parser.add_argument("--local_rank", default=0, type=int, help="distribted training")
# parser.add_argument('--coef_lr', type=float, default=0.1, help='coefficient for bert branch.')
# parser.add_argument('--use_mil', action='store_true', help="Whether to use MIL as in Miech et al. (2020).")
# parser.add_argument('--sampled_use_mil', action='store_true', help="Sampled MIL; takes priority over use_mil.")
# parser.add_argument('--text_num_hidden_layers', type=int, default=12, help="Number of hidden layers in the text encoder.")
# parser.add_argument('--visual_num_hidden_layers', type=int, default=6, help="Number of hidden layers in the visual encoder.")
# parser.add_argument('--cross_num_hidden_layers', type=int, default=2, help="Number of hidden layers in the cross encoder.")
# parser.add_argument('--decoder_num_hidden_layers', type=int, default=3, help="Number of hidden layers in the decoder.")
# parser.add_argument('--train_sim_after_cross', action='store_true', help="Test retrieval after cross encoder.")
# parser.add_argument('--expand_msrvtt_sentences', action='store_true', help="")
# args = parser.parse_args()
# # Check parameters
# if args.gradient_accumulation_steps < 1:
# raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
# args.gradient_accumulation_steps))
# if not args.do_train and not args.do_eval:
# raise ValueError("At least one of `do_train` or `do_eval` must be True.")
# args.batch_size = int(args.batch_size / args.gradient_accumulation_steps)
# return args
def set_seed_logger(args):
    """Seed every RNG for reproducibility (the cuDNN flags trade speed for determinism)."""
    random.seed(args.seed)
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)  # if you are using multi-GPU
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # world_size = torch.distributed.get_world_size()
    # torch.cuda.set_device(args.local_rank)
    # args.world_size = world_size
    # if not os.path.exists(args.output_dir):
    #     os.makedirs(args.output_dir, exist_ok=True)
    return args
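# Illustrative usage sketch: seed all RNGs before building models or dataloaders.
#   args = set_seed_logger(UniVL_args())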
def load_pretrained_UniVL(return_visual_encoder=False):
    args = UniVL_args()
    args = set_seed_logger(args)
    device, n_gpu = 'cuda', 1
    # NOTE: machine-specific checkpoint location; adjust for your environment.
    init_model = '/cpfs01/user/liuhuabin/PDVC/pdvc/modules/univl.pretrained.bin'
    model_state_dict = torch.load(init_model, map_location='cpu')

    # Prepare model
    cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed')
    model = UniVL.from_pretrained('bert-base-uncased', 'visual-base', 'cross-base', 'decoder-base',
                                  cache_dir=cache_dir, state_dict=model_state_dict, task_config=args)
    model.to(device)
    if return_visual_encoder:
        return model.bert, model.visual, model.normalize_video
    return model.bert
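# Illustrative usage sketch (assumes the checkpoint path above is valid):
#   text_encoder = load_pretrained_UniVL()  # BERT text branch only
#   text_enc, visual_enc, normalizer = load_pretrained_UniVL(return_visual_encoder=True)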
def build_UniVL_tokenizer():
    return BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
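# Illustrative usage sketch (mirrors the commented-out example below):
#   tokenizer = build_UniVL_tokenizer()
#   tokens = ['[CLS]'] + tokenizer.tokenize('I love you') + ['[SEP]']
#   input_ids = tokenizer.convert_tokens_to_ids(tokens)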
# if __name__ == '__main__':
#     device, n_gpu = 'cuda', 1
#     captions = ['I love you', 'you believe me']
#     tokenizer_hg = AutoTokenizer.from_pretrained("bert-base-uncased")
#     text_encoder_hg = tokenizer_hg(captions, return_tensors='pt', truncation=True, padding=True, max_length=20)
#     text_encoder_hg = {key: _.to(device) if isinstance(_, torch.Tensor) else _ for key, _ in text_encoder_hg.items()}
#     attention_mask = text_encoder_hg['attention_mask']
#     # Equivalent tokenization with the UniVL BertTokenizer:
#     # tokenizer = build_UniVL_tokenizer()
#     # input_ids = []
#     # for sent in captions:
#     #     sent = tokenizer.tokenize(sent)
#     #     sent = ['[CLS]'] + sent + ['[SEP]']
#     #     input_ids += tokenizer.convert_tokens_to_ids(sent)
#     model = load_pretrained_UniVL()  # BERT text encoder; loads its own checkpoint and config
#     text_embed = model(**text_encoder_hg, output_all_encoded_layers=True)[0][-1]
#     breakpoint()
if __name__ == '__main__':
    device, n_gpu = 'cuda', 1
    # load_pretrained_UniVL() builds its own UniVL_args and seeds all RNGs internally.
    model_bert, model_visual, video_normalizer = load_pretrained_UniVL(return_visual_encoder=True)
    # Dummy batch: 2 clips x 215 frames x 1024-dim video features (video_dim above),
    # placed on the same device as the model.
    inputs = torch.rand(2, 215, 1024, device=device)
    video_mask = torch.ones(2, 215, device=device)
    inputs = video_normalizer(inputs)
    visual_embed = model_visual(inputs, video_mask, output_all_encoded_layers=True)[0][-1]
    breakpoint()