# lart / mvit.yaml
# camenduru's picture
# thanks to brjathu ❤
# commit 8540b52
# Skip printing the full model summary at startup.
LOG_MODEL_INFO: False
# Training loop settings for the AVA detection task.
TRAIN:
  DATASET: ava
  # Per-GPU clip batch size. (An earlier duplicate key set 16; YAML last-wins
  # resolved it to 8, so only the effective value is kept here.)
  BATCH_SIZE: 8
  EVAL_PERIOD: 2  # evaluate every 2 epochs
  CHECKPOINT_PERIOD: 1  # save a checkpoint every epoch
  AUTO_RESUME: True
  # Reset the epoch counter when loading the init checkpoint (fine-tuning).
  # (This key appeared twice with the same value; duplicate removed.)
  CHECKPOINT_EPOCH_RESET: True
  CHECKPOINT_IN_INIT: True
  CHECKPOINT_FILE_PATH: ""  # set to a pretrained weights path to fine-tune
  CHECKPOINT_TYPE: pytorch
# Input pipeline: 40-frame clips sampled every 3rd frame, 312-px crops,
# ImageNet normalization statistics.
DATA:
  USE_OFFSET_SAMPLING: True
  DECODING_BACKEND: torchvision
  NUM_FRAMES: 40
  SAMPLING_RATE: 3
  TRAIN_JITTER_SCALES: [356, 446]  # short-side scale-jitter range (pixels)
  TRAIN_CROP_SIZE: 312
  TEST_CROP_SIZE: 312 # use if TEST.NUM_SPATIAL_CROPS: 1
  INPUT_CHANNEL_NUM: [3]  # RGB
  PATH_TO_DATA_DIR: ""  # presumably filled per deployment — TODO confirm
  TRAIN_JITTER_SCALES_RELATIVE: [0.08, 1.0]
  TRAIN_JITTER_ASPECT_RELATIVE: [0.75, 1.3333]
  MEAN: [0.485, 0.456, 0.406]  # ImageNet mean
  STD: [0.229, 0.224, 0.225]  # ImageNet std
# MViT backbone. This stanza originally contained duplicate keys
# (EMBED_DIM, NUM_HEADS, DEPTH, DROPPATH_RATE, MODE, POOL_Q_STRIDE);
# YAML parsers silently keep the LAST occurrence, so only the effective
# (last-wins) values are kept below.
MVIT:
  ZERO_DECAY_POS_CLS: False
  SEP_POS_EMBED: True
  # Effective architecture: 48 blocks, base embed dim 144, 2 heads.
  # (Earlier shadowed values 24 / 96 / 1 removed.)
  EMBED_DIM: 144
  NUM_HEADS: 2
  DEPTH: 48 # [2, 6, 36, 2]
  # Patch-embedding conv (T, H, W); yacs parses these tuple strings.
  PATCH_KERNEL: (3, 7, 7)
  PATCH_STRIDE: (2, 4, 4)
  PATCH_PADDING: (1, 3, 3)
  MLP_RATIO: 4.0
  QKV_BIAS: True
  DROPPATH_RATE: 0.0
  NORM: "layernorm"
  # Double channel dim and head count at blocks 2, 8, 44 (stage boundaries).
  DIM_MUL: [[2, 2.0], [8, 2.0], [44, 2.0]]
  HEAD_MUL: [[2, 2.0], [8, 2.0], [44, 2.0]]
  POOL_KV_STRIDE_ADAPTIVE: [1, 8, 8]
  POOL_KVQ_KERNEL: [3, 3, 3]
  USE_ABS_POS: False # default: True
  REL_POS_SPATIAL: True # default: false
  REL_POS_TEMPORAL: True # default: false
  MODE: "conv"
  # Per-block query pooling stride [block, T, H, W]: spatial 2x downsampling
  # at blocks 2, 8 and 44 only. (A shadowed short-form duplicate removed.)
  POOL_Q_STRIDE: [[0, 1, 1, 1], [1, 1, 1, 1], [2, 1, 2, 2], [3, 1, 1, 1], [4, 1, 1, 1], [5, 1, 1, 1], [6, 1, 1, 1], [7, 1, 1, 1], [8, 1, 2, 2], [9, 1, 1, 1], [10, 1, 1, 1],
                  [11, 1, 1, 1], [12, 1, 1, 1], [13, 1, 1, 1], [14, 1, 1, 1], [15, 1, 1, 1], [16, 1, 1, 1], [17, 1, 1, 1], [18, 1, 1, 1], [19, 1, 1, 1], [20, 1, 1, 1],
                  [21, 1, 1, 1], [22, 1, 1, 1], [23, 1, 1, 1], [24, 1, 1, 1], [25, 1, 1, 1], [26, 1, 1, 1], [27, 1, 1, 1], [28, 1, 1, 1], [29, 1, 1, 1], [30, 1, 1, 1],
                  [31, 1, 1, 1], [32, 1, 1, 1], [33, 1, 1, 1], [34, 1, 1, 1], [35, 1, 1, 1], [36, 1, 1, 1], [37, 1, 1, 1], [38, 1, 1, 1], [39, 1, 1, 1], [40, 1, 1, 1],
                  [41, 1, 1, 1], [42, 1, 1, 1], [43, 1, 1, 1], [44, 1, 2, 2], [45, 1, 1, 1], [46, 1, 1, 1], [47, 1, 1, 1]]
  RESIDUAL_POOLING: True
  SEPARATE_QKV: True
  CLS_EMBED_ON: False # default: True
# Precise-BN statistics recomputation (disabled; the backbone NORM above
# is layernorm, so these settings are presumably inert — TODO confirm).
BN:
  USE_PRECISE_STATS: False
  NUM_BATCHES_PRECISE: 200
# Run as a detection task (per-person-box action classification, AVA-style).
DETECTION:
  ENABLE: True
  ALIGNED: True  # presumably selects aligned ROI pooling — verify in consumer
  SPATIAL_SCALE_FACTOR: 32  # feature-map downsampling factor vs. input
# AVA v2.2 dataset: annotation files and frame directories.
AVA:
  BGR: False
  # Keep only detected person boxes scoring above this threshold.
  DETECTION_SCORE_THRESH: 0.9
  # Ground-truth boxes plus pre-computed detections for training.
  TRAIN_PREDICT_BOX_LISTS: [
    "ava_train_v2.2.csv",
    "person_box_67091280_iou90/ava_detection_train_boxes_and_labels_include_negative_v2.2.csv",
  ]
  TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"]
  # Cluster-specific absolute paths — adjust for your environment.
  ANNOTATION_DIR: /datasets01/AVA/080720/frame_list/
  FRAME_LIST_DIR: /datasets01/AVA/080720/frame_list/
  FRAME_DIR: /datasets01/AVA/080720/frames/
  FULL_TEST_ON_VAL: True
# Optimizer: SGD + cosine schedule with 5-epoch warmup, 20 epochs total.
SOLVER:
  CLIP_GRAD_L2NORM: 2.0
  ZERO_WD_1D_PARAM: True  # no weight decay on 1-D params (biases, norms)
  BASE_LR_SCALE_NUM_SHARDS: True  # scale LR linearly with shard count
  BASE_LR: 0.075
  COSINE_AFTER_WARMUP: True
  # Scientific notation written with an explicit mantissa dot: bare "1e-7"
  # fails the YAML 1.1 float resolver and loads as a string on PyYAML.
  COSINE_END_LR: 1.0e-7
  WARMUP_START_LR: 1.0e-8
  WARMUP_EPOCHS: 5.0
  LR_POLICY: cosine
  MAX_EPOCH: 20
  MOMENTUM: 0.9
  WEIGHT_DECAY: 1.0e-8
  OPTIMIZING_METHOD: sgd
# MViT head for 80 AVA classes; sigmoid + BCE for multi-label actions.
MODEL:
  NUM_CLASSES: 80
  HEAD_ACT: sigmoid
  # NUM_CLASSES: 600
  ARCH: mvit
  MODEL_NAME: MViT
  LOSS_FUNC: bce # soft_cross_entropy # default cross_entropy
  DROPOUT_RATE: 0.0
  ACT_CHECKPOINT: True # for test flops
# Evaluation: single spatial crop per clip, batch size 1.
TEST:
  ENABLE: True
  DATASET: ava
  BATCH_SIZE: 1
  NUM_SPATIAL_CROPS: 1
  # User-specific absolute path to pretrained weights — adjust locally.
  CHECKPOINT_FILE_PATH: /home/jathu/mvit.pyth
# DataLoader worker settings.
DATA_LOADER:
  NUM_WORKERS: 4
  PIN_MEMORY: True
# Single-GPU, single-machine run; fixed RNG seed; outputs to CWD.
NUM_GPUS: 1
NUM_SHARDS: 1
SHARD_ID: 0
RNG_SEED: 0
OUTPUT_DIR: .
# Demo mode: run inference on a video file (WEBCAM: -1 disables webcam input).
DEMO:
  ENABLE: True
  # User-specific absolute paths — adjust for your environment.
  LABEL_FILE_PATH: /private/home/jathushan/3D/slowfast/ava_names.json
  WEBCAM: -1
  INPUT_VIDEO: /private/home/jathushan/datasets/ttv/webm2/82FE8F069F1354550003607470080_1fcf1757309.4.7.mp4
  OUTPUT_FILE: output.mp4
# #dbg
# DATA_LOADER:
# NUM_WORKERS: 0
# NUM_GPUS: 1