| from email.policy import default |
| import os |
|
|
| import sys |
| BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
| sys.path.append(BASE_DIR) |
|
|
| import cv2 |
|
|
| import argparse |
| import json |
| import os |
| from typing import Any, Dict, List |
|
|
| |
| file_arch = """ |
| ./REFAVS/data |
| - /media |
| - /gt_mask |
| - /metadata.csv |
| - /audio_embed |
| - /image_embed |
| """ |
| |
|
|
| parser = argparse.ArgumentParser( |
| description=( |
| "SimToken" |
| ) |
| ) |
|
|
|
|
|
|
| parser.add_argument("--vision_pretrained",type=str,default='path/to/segment_anything/sam_vit_h_4b8939.pth') |
| parser.add_argument("--vision_tower",type=str,default='openai/clip-vit-large-patch14') |
| parser.add_argument("--mllm",type=str,default='Chat-UniVi/Chat-UniVi-7B-v1.5') |
|
|
| parser.add_argument("--conv_template",type=int,default=1) |
| parser.add_argument("--ct_weight",type=float,default=0.1) |
| parser.add_argument("--input_type",type=str,default='refer') |
| parser.add_argument("--compress",action='store_false',default=True) |
| parser.add_argument("--start",type=int,default=0) |
|
|
|
|
| parser.add_argument("--name",type=str,default='testrun') |
| |
| parser.add_argument("--data_dir",type=str,default='data',help=f"The data paranet dir. File arch should be: {file_arch}") |
| |
| parser.add_argument("--saved_model",type=str,default='trained_simtoken.pth', help="the pretrained simtoken pth") |
|
|
|
|
| parser.add_argument("--log_root",type=str,default='log', help="where to save log during training") |
| parser.add_argument("--checkpoint_root",type=str,default='checkpoints', help="where to save trained checkpoints during training") |
|
|
| parser.add_argument("--visualization_root",type=str,default='visualization', help="where to save visualization result during test") |
| parser.add_argument("--eval_splits",type=str,default='test_s,test_u,test_n', help="comma-separated eval splits for load_model.py: test_s,test_u,test_n") |
|
|
|
|
|
|
|
|
| |
|
|
| |
| parser.add_argument("--lr", type=float, default=5e-5, help='lr to fine tuning adapters.') |
| |
| parser.add_argument("--epochs", type=int, default=10, help='epochs to fine tuning adapters.') |
| parser.add_argument("--batch_size", type=int, default=8) |
|
|
|
|
| parser.add_argument("--gpu_id", type=str, default="0", help="The GPU device to run generation on.") |
|
|
| parser.add_argument("--run", type=str, default='train', help="train, test") |
|
|
| parser.add_argument("--frame_n", type=int, default=10, help="Frame num of each video. Fixed to 10.") |
| parser.add_argument("--text_max_len", type=int, default=25, help="Maximum textual reference length.") |
|
|
|
|
|
|
| args, _ = parser.parse_known_args() |
|
|
| |
| os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id |
| |
|
|