# Cholec80 frame-level feature extraction config. Defines per-video
# 'Recognition_frame' dataset configs for the train/val/test splits and an
# 'MVNet_feature_extractor' model with image and text backbones.
import torch
import torchvision.transforms as transforms

_base_ = ['../base.py']

config = dict(
    # Training split: Cholec80 videos 01-40, one dataset entry per video.
    train_config=[
        dict(
            type='Recognition_frame',
            csv_root='/gpfswork/rech/okw/ukw13bv/mmsl/csv/cholec80/csvs',
            vid='video%02d.csv' % i,
            video_root='/gpfsscratch/rech/okw/ukw13bv/cholec80/frames_output',
            transforms=transforms.Compose([
                transforms.Resize((360, 640)),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
            ]),
        ) for i in range(1, 41)
    ],
    # Validation split: Cholec80 videos 41-48.
    val_config=[
        dict(
            type='Recognition_frame',
            csv_root='/gpfswork/rech/okw/ukw13bv/mmsl/csv/cholec80/csvs',
            vid='video%02d.csv' % i,
            video_root='/gpfsscratch/rech/okw/ukw13bv/cholec80/frames_output',
            transforms=transforms.Compose([
                transforms.Resize((360, 640)),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
            ]),
        ) for i in range(41, 49)
    ],
    # Test split: Cholec80 videos 49-80.
    test_config=[
        dict(
            type='Recognition_frame',
            csv_root='/gpfswork/rech/okw/ukw13bv/mmsl/csv/cholec80/csvs',
            vid='video%02d.csv' % i,
            video_root='/gpfsscratch/rech/okw/ukw13bv/cholec80/frames_output',
            transforms=transforms.Compose([
                transforms.Resize((360, 640)),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
            ]),
        ) for i in range(49, 81)
    ],
    # Model: dual-backbone feature extractor (ResNet-50 image encoder +
    # BioBERT text encoder), both producing 768-dimensional features.
    model_config=dict(
        type='MVNet_feature_extractor',
        backbone_img=dict(
            type='img_backbones/ImageEncoder_feature_extractor',
            # type='img_backbones/ImageEncoder_CLIPVISUAL',
            num_classes=768,
            pretrained='imagenet',  # one of: imagenet / ssl / random
            backbone_name='resnet_50',
            # backbone_name='resnet_50_clip',
            img_norm=False,
        ),
        backbone_text=dict(
            type='text_backbones/BertEncoder',
            text_bert_type='/gpfswork/rech/okw/ukw13bv/mmsl/biobert_pretrain_output_all_notes_150000',
            text_last_n_layers=4,         # use the last 4 encoder layers
            text_aggregate_method='sum',  # summed across those layers
            text_norm=False,
            text_embedding_dim=768,
            text_freeze_bert=False,       # fine-tune BERT weights
            text_agg_tokens=True,         # aggregate token embeddings into one vector
        )
    )
)
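
# ---------------------------------------------------------------------------
# Minimal sketch (illustration only, not part of the original config): run the
# frame preprocessing pipeline defined above on a dummy image to confirm the
# output shape. Assumes Pillow is available, as torchvision transforms require;
# the 854x480 dummy size is an arbitrary stand-in for a raw Cholec80 frame.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from PIL import Image

    pipeline = transforms.Compose([
        transforms.Resize((360, 640)),  # (height, width)
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    dummy_frame = Image.new('RGB', (854, 480))  # PIL size is (width, height)
    out = pipeline(dummy_frame)
    print(out.shape)  # expected: torch.Size([3, 224, 224])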